import gradio as gr
import time
from datetime import date, timedelta, timezone, datetime
import os
import pandas as pd
import numpy as np
import logging
import requests
from bs4 import BeautifulSoup
import urllib.parse
import dateutil.parser
from dateutil import parser as dateutil_parser
from tldextract import extract
from urllib.parse import quote_plus, urlparse
from collections import defaultdict
from dotenv import load_dotenv
from GoogleNews import GoogleNews
import feedparser

# Populate NEWSAPI / GNEWSAPI from a local .env file, if one is present
load_dotenv()


# Your existing functions (unchanged)
def get_google_news(query="AI Agents", cutoff=1):
    """Get Google News articles based on query."""
    days = cutoff
    language = 'en'
    to_day = datetime.today().strftime('%m/%d/%Y')
    from_day = (datetime.today() - timedelta(days=days)).strftime('%m/%d/%Y')
    # Exclude noisy domains and off-topic keywords from the search
    blackList = (' -site:winbuzzer.com -site:x.com -site:threads.com -site:instagram.com'
                 ' -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com'
                 ' -site:youtube.com -site:newser.com -site:adexchanger.com'
                 ' -india -crypto -blockchain -bitcoin -DeFi')
    tQuery = query + blackList
    str_div = []
    print(f"Assembling news with cutoff {cutoff} for query: {tQuery}")
    try:
        googlenews = GoogleNews(start=from_day, end=to_day, lang=language)
        googlenews.search(tQuery)
        page1 = googlenews.result()
        df = pd.DataFrame(page1)
        time_cutoff = datetime.now() - timedelta(days=cutoff)
        for index, row in df.iterrows():
            try:
                news_time = dateutil.parser.parse(str(row['datetime']))
                if news_time >= time_cutoff:
                    domain = extract(row['link']).domain
                    str_a = row.to_dict()
                    str_a['datetime'] = str(news_time)
                    str_a.update({'domain': domain})
                    str_div.append(str_a)
                else:
                    print(f"  Skipping {news_time} < {time_cutoff}")
            except Exception as inner_e:
                print(f"Error parsing datetime for row {index}: {inner_e}")
                continue
    except Exception as e:
        print("Error aggregating news " + str(e))
    return str_div


def resolve_redirect(url):
    """Follow redirects (e.g. news.google.com links) to the final article URL."""
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.url
    except Exception as e:
        print(f"Redirect failed: {e}")
        return url


def get_google_news_new(query="AI Agents", cutoff=1):
    """Get Google News articles via the RSS feed; output mirrors the GoogleNews package."""
    results = []
    print("Assembling news for " + str(query))
    try:
        # Create RSS URL with proper encoding
        encoded_query = quote_plus(query)
        url = f"https://news.google.com/rss/search?q={encoded_query}"

        # Parse the RSS feed
        feed = feedparser.parse(url)

        # Set time cutoff
        time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)

        for entry in feed.entries:
            try:
                # Parse the published or updated date
                if hasattr(entry, 'published'):
                    news_time = dateutil_parser.parse(entry.published)
                elif hasattr(entry, 'updated'):
                    news_time = dateutil_parser.parse(entry.updated)
                else:
                    continue

                # Skip old articles
                if news_time < time_cutoff:
                    continue

                # Resolve final article URL
                final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''

                # Estimate relative time (like '3 hours ago')
                time_diff = datetime.now(timezone.utc) - news_time
                if time_diff.days > 0:
                    relative_date = f"{time_diff.days} days ago"
                elif time_diff.seconds >= 3600:
                    relative_date = f"{time_diff.seconds // 3600} hours ago"
                else:
                    relative_date = f"{time_diff.seconds // 60} minutes ago"

                # Extract domain for media name
                domain_parts = extract(final_url)
                media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"

                # Build result dict
                article_dict = {
                    'title': entry.title if hasattr(entry, 'title') else '',
                    'media': media,
                    'domain': media,
                    'date': relative_date,
                    'datetime': news_time,
                    'link': final_url,
                    'desc': entry.summary if hasattr(entry, 'summary') else '',
                    'img': getattr(entry, 'media_content', [{}])[0].get('url', '') if hasattr(entry, 'media_content') else ''
                }
                print(f"{article_dict}\n")
                results.append(article_dict)
            except Exception as inner_e:
                print(f"Error parsing entry: {inner_e}")
                continue
    except Exception as e:
        print("Error aggregating news " + str(e))
    print(f"I found {len(results)} items.")
    return results
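
# --- Usage sketch (assumption, not part of the original app flow) -------------
# A minimal smoke test for the RSS-based fetcher above. It assumes outbound
# network access and the feedparser/tldextract packages from the imports. The
# helper name preview_google_news is a hypothetical addition; the keys it
# prints ('date', 'domain', 'title') are the ones built in get_google_news_new().
def preview_google_news(query="AI Agents", cutoff=1, limit=5):
    """Fetch via the RSS path and print a compact preview of the first hits."""
    articles = get_google_news_new(query, cutoff)
    for item in articles[:limit]:
        print(f"{item['date']:>16}  {item['domain']:<12}  {item['title'][:70]}")
    return articles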


def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
    """
    Get news articles from NewsAPI.org (free tier: 1000 requests/month).
    Sign up at: https://newsapi.org/
    """
    # Prefer an explicitly passed key; otherwise fall back to the NEWSAPI env var
    newsapi_key = api_key or os.getenv('NEWSAPI')
    if not newsapi_key:
        print("NewsAPI requires an API key. Sign up at https://newsapi.org/")
        return []

    days = cutoff
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
    url = "https://newsapi.org/v2/everything"
    params = {
        'q': query,
        'from': from_date,
        'sortBy': 'publishedAt',
        'language': 'en',
        'apiKey': newsapi_key,
        'pageSize': 50
    }
    try:
        response = requests.get(url, params=params)
        print(response)
        response.raise_for_status()
        data = response.json()

        articles = []
        for article in data.get('articles', []):
            domain = urlparse(article['url']).netloc
            articles.append({
                'title': article['title'],
                'link': article['url'],
                'date': article['publishedAt'][:10],
                'datetime': article['publishedAt'],
                'desc': article['description'] or '',
                'domain': domain,
                'source': article['source']['name']
            })
        return articles
    except Exception as e:
        print(f"Error fetching from NewsAPI: {e}")
        return []


def get_gnews_articles(query="AI Agents", cutoff=1):
    """
    Get news articles from GNews (https://gnews.io/).
    Requires the GNEWSAPI key; the free tier is limited to 100 requests per day.
    """
    import json

    days = cutoff
    api_key = os.getenv('GNEWSAPI')
    if not api_key:
        print("GNews requires an API key. Sign up at https://gnews.io/")
        return []

    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
    url = "https://gnews.io/api/v4/search"
    # https://gnews.io/api/v4/search?q=Google&lang=en&max=5&apikey=YOUR_API_KEY
    params = {
        'q': query,
        'apikey': api_key,
        'lang': 'en',
        'max': 25,
        'from': from_date + 'T00:00:00Z',
        'to': datetime.today().strftime('%Y-%m-%d') + 'T23:59:59Z',
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        with open('data_output.json', 'w') as f:
            json.dump(data, f, indent=2)
    except Exception as e:
        print(f"Error fetching from GNews: {e}")
        return []

    articles = data.get('articles', [])
    rArticles = []
    for article in articles:
        link = article.get('url', "")
        domain = urlparse(link).netloc
        rArticles.append({
            'title': article['title'],
            'link': link,
            'date': article.get('publishedAt', ""),
            'datetime': article.get('publishedAt', ""),
            'desc': article.get('description', ""),
            'domain': domain,
            'media': domain,
            'source': article['source']['name'],
        })
    return rArticles


def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
    """
    Get recent papers from arXiv for a given keyword.
    Uses the arXiv API (no API key required).

    Args:
        query (str): Search keyword(s).
        cutoff (int): How many days back to search.
        max_results (int): Maximum number of results to return.

    Returns:
        list of dicts with paper metadata.
    """
    import json
    import xml.etree.ElementTree as ET
    from urllib.parse import urlencode

    # arXiv API endpoint
    base_url = "http://export.arxiv.org/api/query?"

    if not isinstance(max_results, int) or max_results <= 0:
        max_results = 25  # fall back to a safe default

    # Construct query (arXiv search syntax: all:keyword)
    search_query = f"all:{query}"
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    url = base_url + urlencode(params)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except Exception as e:
        print(f"Error fetching from arXiv: {e}")
        return []
    print(response)

    ns = {"atom": "http://www.w3.org/2005/Atom"}
    papers = []
    for entry in root.findall("atom:entry", ns):
        published = entry.find("atom:published", ns).text
        published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")

        # Filter by cutoff
        if published_dt < (datetime.today() - timedelta(days=cutoff)):
            continue

        # e.g. http://arxiv.org/abs/2509.09656v1 -> http://arxiv.org/pdf/2509.09656v1
        link = entry.find("atom:id", ns).text
        pdf_link = link.replace("/abs/", "/pdf/")
        title = entry.find("atom:title", ns).text.strip()
        summary = entry.find("atom:summary", ns).text.strip()
        authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]

        papers.append({
            "title": title,
            "link": pdf_link,
            "date": published,
            "datetime": published_dt.isoformat(),
            "desc": summary,
            "authors": authors,
            "source": "arXiv",
            "domain": "arxiv.org",
            "media": "arxiv.org",
        })

    # Optional: save to JSON
    with open("arxiv_output.json", "w") as f:
        json.dump(papers, f, indent=2)

    return papers


def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
    """
    Parse an RSS feed for news articles (completely free).

    Example RSS feeds:
      - BBC:     http://feeds.bbci.co.uk/news/rss.xml
      - Reuters: http://feeds.reuters.com/reuters/topNews
      - AP News: https://rsshub.app/ap/topics/apf-topnews
    """
    try:
        import feedparser

        feed = feedparser.parse(rss_url)
        articles = []
        time_cutoff = datetime.now() - timedelta(days=cutoff)

        for entry in feed.entries:
            # Simple keyword matching on title and summary
            if query.lower() in entry.title.lower() or query.lower() in entry.get('summary', '').lower():
                try:
                    # Parse publication date
                    pub_date = datetime(*entry.published_parsed[:6])
                    if pub_date >= time_cutoff:
                        domain = urlparse(entry.link).netloc
                        summary = entry.get('summary', '')
                        articles.append({
                            'title': entry.title,
                            'link': entry.link,
                            'date': pub_date.strftime('%Y-%m-%d'),
                            'datetime': pub_date.isoformat(),
                            'desc': summary[:200] + '...' if len(summary) > 200 else summary,
                            'domain': domain,
                            'source': feed.feed.get('title', 'RSS Feed')
                        })
                except Exception:
                    continue
        return articles
    except ImportError:
        print("RSS parsing requires feedparser: pip install feedparser")
        return []
    except Exception as e:
        print(f"Error parsing RSS feed: {e}")
        return []


# Example usage function that mirrors your original structure
def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
    """
    Main function to get news articles from various sources.

    Args:
        query: Search term
        cutoff_days: How many days back to search
        api_choice: 'newsapi', 'arxiv', 'gnews', or 'rss'
        api_key: API key if required
    """
    if api_choice == "newsapi":
        news_articles = get_newsapi_articles(query, cutoff_days, api_key)
    elif api_choice == "arxiv":
        news_articles = get_arxiv_papers(query, 90, 10)
    elif api_choice == "gnews":
        news_articles = get_gnews_articles(query, cutoff_days)
    elif api_choice == "rss":
        # Example with BBC RSS feed
        rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
        news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
    else:
        print("Invalid API choice")
        return [], pd.DataFrame()

    if not news_articles:
        return "No news articles found for the given query and time period.", pd.DataFrame()

    # Create DataFrame for display (matching your original structure)
    display_data = []
    for i, article in enumerate(news_articles):
        display_data.append({
            'Index': i,
            'Title': article['title'],
            'Link': article['link'],
            'Date': article['date'],
            'Description': article['desc'][:100] + "..." if len(article['desc']) > 100 else article['desc'],
            'Domain': article['domain']
        })

    return news_articles, pd.DataFrame(display_data)
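

# --- Usage sketches (assumptions, not part of the original app flow) -----------
# get_rss_feed_articles() handles one feed at a time; a caller will often want
# to sweep several feeds and de-duplicate by link. The helper below is a
# hypothetical addition; the feed URLs are only the BBC examples already used
# in this file, so swap in your own list.
def get_rss_articles_multi(query="AI Agents", cutoff=1, rss_urls=None):
    """Aggregate keyword matches across several RSS feeds, de-duplicated by link."""
    if rss_urls is None:
        rss_urls = [
            "http://feeds.bbci.co.uk/news/rss.xml",
            "http://feeds.bbci.co.uk/news/technology/rss.xml",
        ]
    seen, combined = set(), []
    for rss_url in rss_urls:
        for article in get_rss_feed_articles(rss_url, query, cutoff):
            if article['link'] not in seen:
                seen.add(article['link'])
                combined.append(article)
    return combined


# Quick manual check of the dispatcher above when the file is run as a script
# (assumption: the Gradio app imports this module and does not rely on this
# block). The 'newsapi' and 'gnews' backends expect NEWSAPI / GNEWSAPI in the
# environment or a .env file; 'rss' and 'arxiv' need no key.
if __name__ == "__main__":
    articles, table = get_news_articles(query="AI Agents", cutoff_days=1, api_choice="rss")
    if isinstance(articles, str):
        print(articles)  # the "no results" message
    else:
        print(f"Fetched {len(articles)} articles")
        print(table.head())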