from tldextract import extract
from urllib.parse import quote_plus
import requests
from bs4 import BeautifulSoup
from io import BytesIO
from collections import defaultdict
from PyPDF2 import PdfReader
import urllib.parse
def clean_url(url):
    """Clean tracking parameters from URLs"""
    # Drop everything after the first '&' (removes trailing tracking
    # parameters such as utm_*, at the cost of any later query params)
    url = url.split('&')[0]
    url = url.rstrip('/')
    # Decode the path to fix encoded '?' or '=' that belong to the path, not query
    fixed_url = urllib.parse.unquote(url)
    return fixed_url
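# Illustrative examples (hypothetical URLs):
#   clean_url("https://example.com/article?id=1&utm_source=feed")
#   -> "https://example.com/article?id=1"   (everything after the first '&' dropped)
#   clean_url("https://example.com/doc%3Fpage%3D2/")
#   -> "https://example.com/doc?page=2"     (trailing '/' stripped, path decoded)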
def get_body(url):
    """Extract article content from URL"""
    body_text = ""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        if url.endswith(".pdf") or "arxiv.org/pdf/" in url:
            # PDF content: extract text page by page
            with BytesIO(response.content) as f:
                reader = PdfReader(f)
                text_parts = []
                for page in reader.pages:
                    text_parts.append(page.extract_text() or "")
                body_text = "\n".join(text_parts)
        else:
            # HTML content: concatenate the text of all <p> tags
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = soup.find_all("p")
            body_text = " ".join(p.get_text(strip=True) for p in paragraphs)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
    return body_text
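# Illustrative usage (real arXiv URL, used here only for demonstration):
#   text = get_body("https://arxiv.org/pdf/1706.03762")
# returns the extracted PDF text, or "" if the request or parsing fails.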
def ner_tagger(text, model):
    """Extract named entities from text"""
    labels = ["Source", "Financial Metric", "Date", "Organization", "Person", "Product", "Percentage", "Monetary Value", "Duration"]
    # `model` is expected to expose a GLiNER-style
    # predict_entities(text, labels, threshold) API
    entities = model.predict_entities(text, labels, threshold=0.1)
    return entities
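# Illustrative usage, assuming a GLiNER model (an assumption inferred from the
# predict_entities signature; adapt to whatever model this app actually loads):
#   from gliner import GLiNER
#   model = GLiNER.from_pretrained("urchade/gliner_base")
#   entities = ner_tagger("Apple reported $90B revenue in Q1 2024.", model)
#   for e in entities:
#       print(e["text"], "->", e["label"])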
def remove_duplicate_relationships(data: str) -> str:
    """Remove duplicate relationships from knowledge graph"""
    lines = data.splitlines()
    triples = []
    subject = None
    for line in lines:
        parts = [part.strip() for part in line.split("-->")]
        if len(parts) != 3:
            continue
        # A blank first column means the line continues the previous subject
        if len(parts[0]) > 0:
            subject = parts[0]
        if subject is None:
            # Skip continuation lines that appear before any subject
            continue
        predicate = parts[1]
        obj = parts[2]
        triples.append((subject, predicate, obj))
    # Deduplicate, then sort so output is stable and grouped by subject
    unique_triples = sorted(set(triples))
    grouped = defaultdict(list)
    for subj, pred, obj in unique_triples:
        grouped[subj].append(f" -->{pred}--> {obj}")
    output_lines = []
    for subj in grouped:
        output_lines.append(subj)
        output_lines.extend(grouped[subj])
    return '\n'.join(output_lines)
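# Illustrative example (hypothetical graph text). A blank subject column
# continues the previous subject, and exact duplicates are collapsed:
#   remove_duplicate_relationships(
#       "Acme Corp --> acquired --> WidgetCo\n"
#       " --> acquired --> WidgetCo\n"
#       "Acme Corp --> reported --> record revenue\n"
#   )
# returns:
#   Acme Corp
#    -->acquired--> WidgetCo
#    -->reported--> record revenue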