# Postwriter / utils.py
import urllib.parse
from collections import defaultdict
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

def clean_url(url):
    """Clean tracking parameters from URLs"""
    # Drop everything after the first '&' (tracking parameters appended to the query)
    url = url.split('&')[0]
    url = url.rstrip('/')
    # Decode the path to fix encoded '?' or '=' that belong to the path, not the query
    fixed_url = urllib.parse.unquote(url)
    return fixed_url
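
# Illustrative example (assumed input, not from the original module):
#   clean_url("https://example.com/post?ref=home&utm_source=newsletter")
#   returns "https://example.com/post?ref=home"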

def get_body(url):
    """Extract article or PDF text content from a URL"""
    body_text = ""
    try:
        # Browser-like headers to avoid trivial bot blocking
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        if url.endswith(".pdf") or "arxiv.org/pdf/" in url:
            # PDF content: extract text page by page
            with BytesIO(response.content) as f:
                reader = PdfReader(f)
                text_parts = []
                for page in reader.pages:
                    text_parts.append(page.extract_text() or "")
                body_text = "\n".join(text_parts)
        else:
            # HTML content: concatenate the text of all <p> tags
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = soup.find_all("p")
            body_text = " ".join([p.get_text(strip=True) for p in paragraphs])
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
    return body_text
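
# Illustrative usage (the URLs are examples chosen here, not taken from the original
# module): the "arxiv.org/pdf/" check routes the first call through the PDF branch.
#   paper_text = get_body("https://arxiv.org/pdf/1706.03762")
#   article_text = get_body("https://example.com/some-article")  # HTML branch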

def ner_tagger(text, model):
    """Extract named entities from text"""
    labels = ["Source", "Financial Metric", "Date", "Organization", "Person", "Product", "Percentage", "Monetary Value", "Duration"]
    # Low threshold (0.1) favours recall over precision
    entities = model.predict_entities(text, labels, threshold=0.1)
    return entities
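
# Illustrative usage, assuming `model` is a GLiNER instance (the original code does not
# name the library, but predict_entities(text, labels, threshold=...) matches the GLiNER API):
#   from gliner import GLiNER
#   model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")
#   for ent in ner_tagger("Apple reported $81.8 billion in revenue for Q3 2023.", model):
#       print(ent["text"], ent["label"], ent["score"])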

def remove_duplicate_relationships(data: str) -> str:
    """Remove duplicate relationships from a knowledge graph"""
    lines = data.splitlines()
    triples = []
    subject = None
    for line in lines:
        parts = [part.strip() for part in line.split("-->")]
        if len(parts) != 3:
            continue
        # A non-empty first part starts a new subject; an empty one reuses the previous subject
        if len(parts[0]) > 0:
            subject = parts[0]
        predicate = parts[1]
        obj = parts[2]
        triples.append((subject, predicate, obj))

    # Deduplicate and sort, then group relationships under each subject
    unique_triples = sorted(set(triples))
    grouped = defaultdict(list)
    for subj, pred, obj in unique_triples:
        grouped[subj].append(f" -->{pred}--> {obj}")

    output_lines = []
    for subj in grouped:
        output_lines.append(subj)
        output_lines.extend(grouped[subj])
    return '\n'.join(output_lines)
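

if __name__ == "__main__":
    # Minimal smoke test for remove_duplicate_relationships; the sample graph below is
    # illustrative and not part of the original project.
    sample = "\n".join([
        "OpenAI -->develops--> GPT-4",
        "OpenAI -->develops--> GPT-4",              # exact duplicate, removed
        " -->headquartered in--> San Francisco",    # empty subject reuses "OpenAI"
    ])
    print(remove_duplicate_relationships(sample))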