from collections import defaultdict
from io import BytesIO
import urllib.parse

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from tldextract import extract
from urllib.parse import quote_plus


def clean_url(url):
    """Clean tracking parameters from URLs"""
    # Keep everything up to the first '&' (drops trailing tracking params) and any trailing slash.
    url = url.split('&')[0]
    url = url.rstrip('/')
    # Decode the path to fix encoded '?' or '=' that belong to the path, not the query.
    fixed_url = urllib.parse.unquote(url)
    return fixed_url


def get_body(url):
    """Extract article content from URL"""
    body_text = ""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        if url.endswith(".pdf") or "arxiv.org/pdf/" in url:
            # PDF content: extract text page by page.
            with BytesIO(response.content) as f:
                reader = PdfReader(f)
                text_parts = []
                for page in reader.pages:
                    text_parts.append(page.extract_text() or "")
                body_text = "\n".join(text_parts)
        else:
            # HTML content: concatenate the text of all <p> tags.
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = soup.find_all("p")
            body_text = " ".join([p.get_text(strip=True) for p in paragraphs])
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
    return body_text


def ner_tagger(text, model):
    """Extract named entities from text"""
    labels = ["Source", "Financial Metric", "Date", "Organization", "Person",
              "Product", "Percentage", "Monetary Value", "Duration"]
    entities = model.predict_entities(text, labels, threshold=0.1)
    return entities


def remove_duplicate_relationships(data: str) -> str:
    """Remove duplicate relationships from knowledge graph"""
    lines = data.splitlines()
    triples = []
    subject = None
    for line in lines:
        parts = [part.strip() for part in line.split("-->")]
        if len(parts) != 3:
            continue
        # Continuation lines start with '-->' and inherit the last seen subject.
        if len(parts[0]) > 0:
            subject = parts[0]
        predicate = parts[1]
        obj = parts[2]
        triples.append((subject, predicate, obj))

    # De-duplicate, then regroup triples under their subject for output.
    unique_triples = sorted(set(triples))
    grouped = defaultdict(list)
    for subj, pred, obj in unique_triples:
        grouped[subj].append(f" -->{pred}--> {obj}")

    output_lines = []
    for subj in grouped:
        output_lines.append(subj)
        output_lines.extend(grouped[subj])
    return '\n'.join(output_lines)
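

# Usage sketch, not part of the pipeline above. It assumes a GLiNER-style model,
# since `ner_tagger` calls `predict_entities(text, labels, threshold=...)`; the
# model name and example URL below are illustrative assumptions.
if __name__ == "__main__":
    from gliner import GLiNER

    model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")  # assumed model id
    url = clean_url("https://example.com/article?id=42&utm_source=newsletter")
    text = get_body(url)
    if text:
        for entity in ner_tagger(text, model):
            # GLiNER returns dicts with at least "text" and "label" keys.
            print(entity["text"], "->", entity["label"])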