import gradio as gr
import time
from datetime import date, timedelta, timezone, datetime
import os
import pandas as pd
import numpy as np
import logging
import requests
from bs4 import BeautifulSoup
import urllib.parse
import dateutil.parser
from dateutil import parser as dateutil_parser
from tldextract import extract
from urllib.parse import quote_plus, urlparse
from collections import defaultdict
from dotenv import load_dotenv
from GoogleNews import GoogleNews
import feedparser

# Your existing functions (unchanged)
def get_google_news(query="AI Agents", cutoff=1):
    """Get Google News articles based on query."""
    days = cutoff
    language = 'en'
    to_day = datetime.today().strftime('%m/%d/%Y')
    from_day = (datetime.today() - timedelta(days=days)).strftime('%m/%d/%Y')
    blackList = ' -site:winbuzzer.com -site:x.com -site:threads.com -site:instagram.com -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com -site:youtube.com -site:newser.com -site:adexchanger.com -india -crypto -blockchain -bitcoin -DeFi'
    tQuery = query + blackList
    str_div = []
    print(f"Assembling news with cutoff {cutoff} for query: {tQuery}")
    try:
        googlenews = GoogleNews(start=from_day, end=to_day, lang=language)
        googlenews.search(tQuery)
        page1 = googlenews.result()
        df = pd.DataFrame(page1)
        time_cutoff = datetime.now() - timedelta(days=cutoff)
        for index, row in df.iterrows():
            try:
                news_time = dateutil.parser.parse(str(row['datetime']))
                if news_time >= time_cutoff:
                    domain = extract(row['link']).domain
                    str_a = row.to_dict()
                    str_a['datetime'] = str(news_time)
                    str_a.update({'domain': domain})
                    str_div.append(str_a)
                else:
                    print(f"  Skipping {news_time} (older than {time_cutoff})")
            except Exception as inner_e:
                print(f"Error parsing datetime for row {index}: {inner_e}")
                continue
    except Exception as e:
        print("Error aggregating news " + str(e))
    return str_div
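
# Minimal usage sketch for get_google_news (illustrative only, not part of the
# original app); assumes the GoogleNews package can reach Google News from this
# environment and that results carry the 'datetime', 'title', and 'domain' keys
# built above.
def _demo_google_news(query="AI Agents", cutoff=2):
    articles = get_google_news(query, cutoff)
    for a in articles[:5]:
        print(a.get('datetime'), '|', a.get('title'), '|', a.get('domain'))
    return articles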
def resolve_redirect(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.url
    except Exception as e:
        print(f"Redirect failed: {e}")
        return url
def get_google_news_new(query="AI Agents", cutoff=1):
    """Get Google News articles via the RSS feed; output mirrors the GoogleNews package."""
    results = []
    print("Assembling news for " + str(query))
    try:
        # Create RSS URL with proper encoding
        encoded_query = quote_plus(query)
        url = f"https://news.google.com/rss/search?q={encoded_query}"
        # Parse the RSS feed
        feed = feedparser.parse(url)
        # Set time cutoff
        time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)
        for entry in feed.entries:
            try:
                # Parse the published or updated date
                if hasattr(entry, 'published'):
                    news_time = dateutil_parser.parse(entry.published)
                elif hasattr(entry, 'updated'):
                    news_time = dateutil_parser.parse(entry.updated)
                else:
                    continue
                # Skip old articles
                if news_time < time_cutoff:
                    continue
                # Resolve the final article URL
                final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''
                # Estimate relative time (like '3 hours ago')
                time_diff = datetime.now(timezone.utc) - news_time
                if time_diff.days > 0:
                    relative_date = f"{time_diff.days} days ago"
                elif time_diff.seconds >= 3600:
                    relative_date = f"{time_diff.seconds // 3600} hours ago"
                else:
                    relative_date = f"{time_diff.seconds // 60} minutes ago"
                # Extract domain for media name
                domain_parts = extract(final_url)
                media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"
                # Build result dict
                article_dict = {
                    'title': entry.title if hasattr(entry, 'title') else '',
                    'media': media,
                    'domain': media,
                    'date': relative_date,
                    'datetime': news_time,
                    'link': final_url,
                    'desc': entry.summary if hasattr(entry, 'summary') else '',
                    'img': entry.media_content[0].get('url', '') if getattr(entry, 'media_content', None) else ''
                }
                print(f"{article_dict}\n")
                results.append(article_dict)
            except Exception as inner_e:
                print(f"Error parsing entry: {inner_e}")
                continue
    except Exception as e:
        print("Error aggregating news " + str(e))
    print(f"I found {len(results)} items.")
    return results
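
# Illustrative sketch comparing the RSS-based fetcher with the package-based one
# (not part of the original app); assumes outbound network access from the Space.
def _demo_google_news_rss(query="AI Agents", cutoff=1):
    rss_articles = get_google_news_new(query, cutoff)
    print(f"RSS feed returned {len(rss_articles)} articles within the last {cutoff} day(s)")
    for a in rss_articles[:3]:
        print(a['date'], '|', a['media'], '|', a['title'])
    return rss_articles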
def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
    """
    Get news articles from NewsAPI.org (free tier: 1,000 requests/month).
    Sign up at: https://newsapi.org/
    """
    newsapi_key = api_key or os.getenv('NEWSAPI')
    if not newsapi_key:
        print("NewsAPI requires an API key. Sign up at https://newsapi.org/")
        return []
    days = cutoff
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
    url = "https://newsapi.org/v2/everything"
    params = {
        'q': query,
        'from': from_date,
        'sortBy': 'publishedAt',
        'language': 'en',
        'apiKey': newsapi_key,
        'pageSize': 50
    }
    try:
        response = requests.get(url, params=params)
        print(response)
        response.raise_for_status()
        data = response.json()
        articles = []
        for article in data.get('articles', []):
            domain = urlparse(article['url']).netloc
            articles.append({
                'title': article['title'],
                'link': article['url'],
                'date': article['publishedAt'][:10],
                'datetime': article['publishedAt'],
                'desc': article['description'] or '',
                'domain': domain,
                'source': article['source']['name']
            })
        return articles
    except Exception as e:
        print(f"Error fetching from NewsAPI: {e}")
        return []
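
# Illustrative sketch for the NewsAPI path (not part of the original app);
# assumes a key is available either as an argument or via the NEWSAPI
# environment variable.
def _demo_newsapi(query="AI Agents", cutoff=1):
    articles = get_newsapi_articles(query, cutoff, api_key=os.getenv('NEWSAPI'))
    df = pd.DataFrame(articles)
    if not df.empty:
        print(df[['date', 'source', 'title']].head())
    return df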
def get_gnews_articles(query="AI Agents", cutoff=1):
    """
    Get news articles from GNews (https://gnews.io/).
    Requires a free API key in the GNEWSAPI environment variable;
    the free tier is limited to 100 requests per day.
    """
    import json
    days = cutoff
    api_key = os.environ['GNEWSAPI']
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
    url = "https://gnews.io/api/v4/search"
    # https://gnews.io/api/v4/search?q=Google&lang=en&max=5&apikey=YOUR_API_KEY
    params = {
        'q': query,
        'apikey': api_key,
        'lang': 'en',
        'max': 25,
        'from': from_date + 'T00:00:00Z',
        'to': datetime.today().strftime('%Y-%m-%d') + 'T23:59:59Z',
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        with open('data_output.json', 'w') as f:
            json.dump(data, f, indent=2)
    except Exception as e:
        print(f"Error fetching from GNews: {e}")
        return []
    articles = data.get('articles', [])
    rArticles = []
    for article in articles:
        try:
            link = article.get('url', "")
            domain = urlparse(link).netloc
            rArticles.append({
                'title': article.get('title', ""),
                'link': link,
                'date': article.get('publishedAt', ""),
                'datetime': article.get('publishedAt', ""),
                'desc': article.get('description', ""),
                'domain': domain,
                'media': domain,
                'source': article.get('source', {}).get('name', ""),
            })
        except Exception as e:
            print(f"Error preparing from GNews: {e}")
            continue
    return rArticles
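
# Illustrative sketch for the GNews path (not part of the original app);
# assumes the GNEWSAPI environment variable is set, since get_gnews_articles
# reads os.environ['GNEWSAPI'] directly.
def _demo_gnews(query="AI Agents", cutoff=1):
    articles = get_gnews_articles(query, cutoff)
    for a in articles[:5]:
        print(a['datetime'], '|', a['domain'], '|', a['title'])
    return articles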
def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
    """
    Get recent papers from arXiv for a given keyword.
    Uses the arXiv API (no API key required).
    Args:
        query (str): Search keyword(s).
        cutoff (int): How many days back to search.
        max_results (int): Maximum number of results to return.
    Returns:
        list of dicts with paper metadata.
    """
    import json
    from urllib.parse import urlencode
    import xml.etree.ElementTree as ET
    # arXiv API endpoint
    base_url = "http://export.arxiv.org/api/query?"
    if not isinstance(max_results, int) or max_results <= 0:
        max_results = 25  # fall back to a safe default
    # Construct the query (arXiv search syntax: all:keyword)
    search_query = f"all:{query}"
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except Exception as e:
        print(f"Error fetching from arXiv: {e}")
        return []
    print(response)
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    papers = []
    for entry in root.findall("atom:entry", ns):
        published = entry.find("atom:published", ns).text
        published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
        # Filter by cutoff
        if published_dt < (datetime.today() - timedelta(days=cutoff)):
            continue
        # The Atom id looks like http://arxiv.org/abs/2509.09656v1;
        # the PDF lives at the corresponding /pdf/ URL.
        link = entry.find("atom:id", ns).text
        pdf_link = link.replace("/abs/", "/pdf/")
        title = entry.find("atom:title", ns).text.strip()
        summary = entry.find("atom:summary", ns).text.strip()
        authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]
        papers.append({
            "title": title,
            "link": pdf_link,
            "date": published,
            "datetime": published_dt.isoformat(),
            "desc": summary,
            "authors": authors,
            "source": "arXiv",
            "domain": "arxiv.org",
            "media": "arxiv.org",
        })
    # Optional: save to JSON
    with open("arxiv_output.json", "w") as f:
        json.dump(papers, f, indent=2)
    return papers
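
# Illustrative sketch for the arXiv path (not part of the original app);
# assumes export.arxiv.org is reachable. The wider cutoff reflects that papers
# appear less frequently than news items.
def _demo_arxiv(query="AI Agents", cutoff=30, max_results=10):
    papers = get_arxiv_papers(query, cutoff, max_results)
    for p in papers:
        print(p['date'], '|', p['title'])
        print('   ', p['link'])
    return papers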
def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
    """
    Parse RSS feeds for news articles (completely free).
    Example RSS feeds:
      - BBC: http://feeds.bbci.co.uk/news/rss.xml
      - Reuters: http://feeds.reuters.com/reuters/topNews
      - AP News: https://rsshub.app/ap/topics/apf-topnews
    """
    try:
        import feedparser
        feed = feedparser.parse(rss_url)
        articles = []
        time_cutoff = datetime.now() - timedelta(days=cutoff)
        for entry in feed.entries:
            # Simple keyword matching against title and summary
            if query.lower() in entry.title.lower() or query.lower() in entry.get('summary', '').lower():
                try:
                    # Parse the publication date
                    pub_date = datetime(*entry.published_parsed[:6])
                    if pub_date >= time_cutoff:
                        domain = urlparse(entry.link).netloc
                        desc = entry.get('summary', '')
                        articles.append({
                            'title': entry.title,
                            'link': entry.link,
                            'date': pub_date.strftime('%Y-%m-%d'),
                            'datetime': pub_date.isoformat(),
                            'desc': desc[:200] + '...' if len(desc) > 200 else desc,
                            'domain': domain,
                            'source': feed.feed.get('title', 'RSS Feed')
                        })
                except Exception:
                    continue
        return articles
    except ImportError:
        print("RSS parsing requires feedparser: pip install feedparser")
        return []
    except Exception as e:
        print(f"Error parsing RSS feed: {e}")
        return []
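
# Illustrative sketch for the generic RSS path (not part of the original app);
# the BBC technology feed is just one example URL, matching the docstring above.
def _demo_rss(query="AI", cutoff=1):
    rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
    articles = get_rss_feed_articles(rss_url, query, cutoff)
    for a in articles[:5]:
        print(a['date'], '|', a['source'], '|', a['title'])
    return articles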
# Example usage function that mirrors your original structure
def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
    """
    Main function to get news articles from various sources.
    Args:
        query: Search term.
        cutoff_days: How many days back to search.
        api_choice: 'newsapi', 'arxiv', 'gnews', or 'rss'.
        api_key: API key if required.
    """
    if api_choice == "newsapi":
        news_articles = get_newsapi_articles(query, cutoff_days, api_key)
    elif api_choice == "arxiv":
        news_articles = get_arxiv_papers(query, 90, 10)
    elif api_choice == "gnews":
        news_articles = get_gnews_articles(query, cutoff_days)
    elif api_choice == "rss":
        # Example with the BBC technology RSS feed
        rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
        news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
    else:
        print("Invalid API choice")
        return [], pd.DataFrame()
    if not news_articles:
        return "No news articles found for the given query and time period.", pd.DataFrame()
    # Create a DataFrame for display (matching the original structure)
    display_data = []
    for i, article in enumerate(news_articles):
        display_data.append({
            'Index': i,
            'Title': article['title'],
            'Link': article['link'],
            'Date': article['date'],
            'Description': article['desc'][:100] + "..." if len(article['desc']) > 100 else article['desc'],
            'Domain': article['domain']
        })
    return news_articles, pd.DataFrame(display_data)
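
# Illustrative entry point (not part of the original app, which is driven by a
# Gradio UI); assumes it is acceptable to hit the selected source when the
# module is run directly rather than imported by the Space.
if __name__ == "__main__":
    articles, display_df = get_news_articles(
        query="AI Agents",
        cutoff_days=1,
        api_choice="rss",  # 'rss' needs no API key; swap for 'newsapi', 'gnews', or 'arxiv'
    )
    if isinstance(articles, str):
        print(articles)  # "No news articles found ..." message
    else:
        print(display_df.head())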