# Postwriter / utils.py
import urllib.parse
from collections import defaultdict
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

def clean_url(url):
    """Clean tracking parameters from URLs"""
    # Drop everything after the first '&' (tracking parameters appended to the query)
    url = url.split('&')[0]
    url = url.rstrip('/')
    # Decode the path to fix encoded '?' or '=' that belong to the path, not the query
    fixed_url = urllib.parse.unquote(url)
    return fixed_url
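
# Illustrative example (assumed input, not from the original module):
#   clean_url("https://example.com/post?ref=home&utm_source=newsletter")
#   returns "https://example.com/post?ref=home"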

def get_body(url):
    """Extract article or PDF text content from a URL"""
    body_text = ""
    try:
        # Browser-like headers to avoid trivial bot blocking
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        if url.endswith(".pdf") or "arxiv.org/pdf/" in url:
            # PDF content: extract text page by page
            with BytesIO(response.content) as f:
                reader = PdfReader(f)
                text_parts = []
                for page in reader.pages:
                    text_parts.append(page.extract_text() or "")
                body_text = "\n".join(text_parts)
        else:
            # HTML content: concatenate the text of all <p> tags
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = soup.find_all("p")
            body_text = " ".join([p.get_text(strip=True) for p in paragraphs])
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
    return body_text
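
# Illustrative usage (the URLs are examples chosen here, not taken from the original
# module): the "arxiv.org/pdf/" check routes the first call through the PDF branch.
#   paper_text = get_body("https://arxiv.org/pdf/1706.03762")
#   article_text = get_body("https://example.com/some-article")  # HTML branch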

def ner_tagger(text, model):
    """Extract named entities from text"""
    labels = ["Source", "Financial Metric", "Date", "Organization", "Person", "Product", "Percentage", "Monetary Value", "Duration"]
    # Low threshold (0.1) favours recall over precision
    entities = model.predict_entities(text, labels, threshold=0.1)
    return entities
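
# Illustrative usage, assuming `model` is a GLiNER instance (the original code does not
# name the library, but predict_entities(text, labels, threshold=...) matches the GLiNER API):
#   from gliner import GLiNER
#   model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")
#   for ent in ner_tagger("Apple reported $81.8 billion in revenue for Q3 2023.", model):
#       print(ent["text"], ent["label"], ent["score"])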

def remove_duplicate_relationships(data: str) -> str:
    """Remove duplicate relationships from a knowledge graph"""
    lines = data.splitlines()
    triples = []
    subject = None
    for line in lines:
        parts = [part.strip() for part in line.split("-->")]
        if len(parts) != 3:
            continue
        # A non-empty first part starts a new subject; an empty one reuses the previous subject
        if len(parts[0]) > 0:
            subject = parts[0]
        predicate = parts[1]
        obj = parts[2]
        triples.append((subject, predicate, obj))

    # Deduplicate and sort, then group relationships under each subject
    unique_triples = sorted(set(triples))
    grouped = defaultdict(list)
    for subj, pred, obj in unique_triples:
        grouped[subj].append(f" -->{pred}--> {obj}")

    output_lines = []
    for subj in grouped:
        output_lines.append(subj)
        output_lines.extend(grouped[subj])
    return '\n'.join(output_lines)
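

if __name__ == "__main__":
    # Minimal smoke test for remove_duplicate_relationships; the sample graph below is
    # illustrative and not part of the original project.
    sample = "\n".join([
        "OpenAI -->develops--> GPT-4",
        "OpenAI -->develops--> GPT-4",              # exact duplicate, removed
        " -->headquartered in--> San Francisco",    # empty subject reuses "OpenAI"
    ])
    print(remove_duplicate_relationships(sample))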