# Postwriter / connector.py
import gradio as gr
import time
from datetime import date, timedelta, timezone, datetime
import os
import pandas as pd
import numpy as np
import logging
import requests
from bs4 import BeautifulSoup
import urllib.parse
import dateutil.parser
from dateutil import parser as dateutil_parser
from tldextract import extract
from urllib.parse import quote_plus, urlparse
from collections import defaultdict
from dotenv import load_dotenv
from GoogleNews import GoogleNews
import feedparser
load_dotenv()  # pull API keys (e.g. NEWSAPI, GNEWSAPI) from a local .env file if present
# --- News source connectors ---
def get_google_news(query="AI Agents", cutoff=1):
"""Get Google News articles based on query"""
days = cutoff
language = 'en'
to_day = datetime.today().strftime('%m/%d/%Y')
from_day = (datetime.today() - timedelta(days=days)).strftime('%m/%d/%Y')
blackList=' -site:winbuzzer.com -site:x.com -site:threads.com -site:instagram.com -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com -site:youtube.com -site:newser.com -site:adexchanger.com -india -crypto -blockchain -bitcoin -DeFi'
tQuery=query+blackList
str_div = []
print(f"Assembling news with cutoff {cutoff} for query: {str(tQuery)} ")
try:
googlenews = GoogleNews(start=from_day, end=to_day, lang=language)
googlenews.search(tQuery)
page1 = googlenews.result()
df = pd.DataFrame(page1)
time_cutoff = datetime.now() - timedelta(days=cutoff)
for index, row in df.iterrows():
try:
news_time = dateutil.parser.parse(str(row['datetime']))
if news_time >= time_cutoff:
domain = extract(row['link']).domain
str_a = row.to_dict()
str_a['datetime'] = str(news_time)
str_a.update({'domain': domain})
str_div.append(str_a)
else:
print(f" Skipping {news_time} > {time_cutoff}")
except Exception as inner_e:
print(f"Error parsing datetime for row {index}: {inner_e}")
continue
except Exception as e:
print("Error aggregating news " + str(e))
return str_div
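# Illustrative sketch (not called anywhere in this module): turning the list of dicts
# returned by get_google_news() into a deduplicated DataFrame. The helper name
# _google_news_to_df is ours; the column names mirror the keys assembled above
# ('title', 'link', 'datetime', 'domain').
def _google_news_to_df(query="AI Agents", cutoff=1):
    items = get_google_news(query=query, cutoff=cutoff)
    df = pd.DataFrame(items)
    if df.empty:
        return df
    # Drop duplicate links and show the newest articles first (datetime is an ISO-like string).
    df = df.drop_duplicates(subset='link').sort_values('datetime', ascending=False)
    return df.reset_index(drop=True)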
def resolve_redirect(url):
try:
response = requests.head(url, allow_redirects=True, timeout=5)
return response.url
except Exception as e:
print(f"Redirect failed: {e}")
return url
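# Optional sketch: resolve_redirect() issues one HEAD request per link, and the same
# Google News URLs can repeat across queries, so a caller might memoise it.
# functools.lru_cache is standard library; the wrapper name is ours.
from functools import lru_cache

@lru_cache(maxsize=512)
def _resolve_redirect_cached(url):
    return resolve_redirect(url)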
def get_google_news_new(query="AI Agents", cutoff=1):
"""Get Google News articles based on query using RSS feed, output similar to GoogleNews package"""
results = []
print("Assembling news for " + str(query))
try:
# Create RSS URL with proper encoding
        encoded_query = quote_plus(query)
url = f"https://news.google.com/rss/search?q={encoded_query}"
# Parse the RSS feed
feed = feedparser.parse(url)
# Set time cutoff
time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)
for entry in feed.entries:
try:
# Parse the published or updated date
if hasattr(entry, 'published'):
news_time = dateutil_parser.parse(entry.published)
elif hasattr(entry, 'updated'):
news_time = dateutil_parser.parse(entry.updated)
else:
continue
# Skip old articles
if news_time < time_cutoff:
continue
# Resolve final article URL
final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''
# Estimate relative time (like '3 hours ago')
time_diff = datetime.now(timezone.utc) - news_time
if time_diff.days > 0:
relative_date = f"{time_diff.days} days ago"
elif time_diff.seconds >= 3600:
relative_date = f"{time_diff.seconds // 3600} hours ago"
else:
relative_date = f"{time_diff.seconds // 60} minutes ago"
# Extract domain for media name
domain_parts = extract(final_url)
media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"
# Build result dict
article_dict = {
'title': entry.title if hasattr(entry, 'title') else '',
'media': media,
'domain': media,
'date': relative_date,
'datetime': news_time,
'link': final_url,
'desc': entry.summary if hasattr(entry, 'summary') else '',
'img': getattr(entry, 'media_content', [{}])[0].get('url', '') if hasattr(entry, 'media_content') else ''
}
print(f"{article_dict}\n")
results.append(article_dict)
except Exception as inner_e:
print(f"Error parsing entry: {inner_e}")
continue
except Exception as e:
print("Error aggregating news " + str(e))
print(f"I found {len(results)} items.")
return results
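# Sketch only: unlike get_google_news() above, get_google_news_new() does not apply a
# blacklist, so a caller could filter domains afterwards. The blocked set below is an
# illustrative example, not a definitive list; the helper name is ours.
def _filter_rss_results(results, blocked=("reddit", "youtube", "linkedin", "facebook")):
    # 'domain' holds the capitalised registered domain (e.g. 'Reddit'), so compare case-insensitively.
    return [r for r in results if r.get('domain', '').lower() not in blocked]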
def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
"""
Get news articles from NewsAPI.org (Free tier: 1000 requests/month)
Sign up at: https://newsapi.org/
"""
    newsapi_key = api_key or os.getenv('NEWSAPI')
    if not newsapi_key:
        print("NewsAPI requires an API key (pass api_key or set the NEWSAPI env var). Sign up at https://newsapi.org/")
        return []
    days = cutoff
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
url = "https://newsapi.org/v2/everything"
params = {
'q': query,
'from': from_date,
'sortBy': 'publishedAt',
'language': 'en',
'apiKey': newsapi_key,
'pageSize': 50
}
try:
response = requests.get(url, params=params)
        print(f"NewsAPI response status: {response.status_code}")
response.raise_for_status()
data = response.json()
articles = []
for article in data.get('articles', []):
domain = urlparse(article['url']).netloc
articles.append({
'title': article['title'],
'link': article['url'],
'date': article['publishedAt'][:10],
'datetime': article['publishedAt'],
'desc': article['description'] or '',
'domain': domain,
'source': article['source']['name']
})
return articles
except Exception as e:
print(f"Error fetching from NewsAPI: {e}")
return []
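# Sketch: NewsAPI returns 'publishedAt' as an ISO string, while some of the other
# connectors return datetime objects, so a caller mixing sources might normalise the
# field first. dateutil_parser is the alias imported at the top of this module; the
# helper name is ours.
def _normalise_newsapi_dates(articles):
    for a in articles:
        try:
            a['datetime'] = dateutil_parser.parse(str(a['datetime']))
        except (ValueError, TypeError):
            pass  # leave unparseable values untouched
    return articles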
def get_gnews_articles(query="AI Agents", cutoff=1):
"""
Get news articles from GNews (No API key required, but has rate limits)
Completely free but limited to 100 requests per day
"""
import json
days = cutoff
api_key=os.environ['GNEWSAPI']
from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
url = "https://gnews.io/api/v4/search"
#https://gnews.io/api/v4/search?q=Google&lang=en&max=5&apikey=YOUR_API_KEY
params = {
'q': query,
'apikey':api_key,
'lang': 'en',
'max': 25,
'from': from_date + 'T00:00:00Z',
'to': datetime.today().strftime('%Y-%m-%d') + 'T23:59:59Z',
}
try:
response = requests.get(url, params=params)
response.raise_for_status()
data = response.json()
        with open('data_output.json', 'w') as f:
            json.dump(data, f, indent=2)
except Exception as e:
print(f"Error fetching from GNews: {e}")
return []
articles = data.get('articles', [])
    rArticles = []
    for article in articles:
        try:
            link = article.get('url', "")
            domain = urlparse(link).netloc
            rArticles.append({
                'title': article.get('title', ""),
                'link': link,
                'date': article.get('publishedAt', ""),
                'datetime': article.get('publishedAt', ""),
                'desc': article.get('description', ""),
                'domain': domain,
                'media': domain,
                'source': article.get('source', {}).get('name', ""),
            })
        except Exception as e:
            print(f"Error preparing article from GNews: {e}")
            continue
    return rArticles
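# Sketch: the GNews free tier is rate limited, so a caller might pause and retry once
# when a request comes back empty. The retry count and delay are illustrative choices;
# the helper name is ours.
def _gnews_with_retry(query="AI Agents", cutoff=1, retries=1, delay=5):
    for attempt in range(retries + 1):
        articles = get_gnews_articles(query=query, cutoff=cutoff)
        if articles or attempt == retries:
            return articles
        time.sleep(delay)  # brief pause before retrying
    return []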
def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
"""
Get recent papers from Arxiv for a given keyword.
Uses the Arxiv API (no API key required).
Args:
query (str): Search keyword(s).
cutoff (int): How many days back to search.
max_results (int): Maximum number of results to return.
Returns:
list of dicts with paper metadata.
"""
    import json
    from urllib.parse import urlencode
    import xml.etree.ElementTree as ET
    # No date range is passed to the API; results are sorted by submittedDate
    # and the cutoff is applied after fetching (see the loop below).
# Arxiv API endpoint
base_url = "http://export.arxiv.org/api/query?"
if not isinstance(max_results, int) or max_results <= 0:
max_results = 25 # fallback to safe default
# Construct query (Arxiv search syntax: all:keyword)
search_query = f"all:{query}"
params = {
"search_query": search_query,
"start": 0,
"max_results": max_results,
"sortBy": "submittedDate",
"sortOrder": "descending",
}
url = base_url + urlencode(params)
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
root = ET.fromstring(response.text)
except Exception as e:
print(f"Error fetching from Arxiv: {e}")
return []
    print(f"Arxiv query returned HTTP {response.status_code}")
ns = {"atom": "http://www.w3.org/2005/Atom"}
papers = []
for entry in root.findall("atom:entry", ns):
published = entry.find("atom:published", ns).text
published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
# Filter by cutoff
if published_dt < (datetime.today() - timedelta(days=cutoff)):
continue
link = entry.find("atom:id", ns).text
pdf_link = link.replace("/abs/", "/pdf/")
title = entry.find("atom:title", ns).text.strip()
summary = entry.find("atom:summary", ns).text.strip()
authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]
papers.append({
"title": title,
"link": pdf_link,
"date": published,
"datetime": published_dt.isoformat(),
"desc": summary,
"authors": authors,
"source": "arXiv",
"domain": "arxiv.org",
"media": "arxiv.org",
})
# "link": "http://arxiv.org/abs/2509.09656v1",
#https://arxiv.org/pdf/2509.09656v1
# Optional: save to JSON
with open("arxiv_output.json", "w") as f:
json.dump(papers, f, indent=2)
return papers
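# Sketch: format the arXiv results as a short markdown digest (e.g. for a post draft).
# Field names match the dicts built above; the formatting choices (top 5 papers,
# first three authors) and the helper name are ours.
def _arxiv_digest(query="AI Agents", cutoff=7, top_n=5):
    papers = get_arxiv_papers(query=query, cutoff=cutoff, max_results=25)
    lines = []
    for p in papers[:top_n]:
        authors = ", ".join(p['authors'][:3])
        lines.append(f"- [{p['title']}]({p['link']}) by {authors}")
    return "\n".join(lines)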
def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
"""
Parse RSS feeds for news articles (Completely free)
Example RSS feeds:
- BBC: http://feeds.bbci.co.uk/news/rss.xml
- Reuters: http://feeds.reuters.com/reuters/topNews
- AP News: https://rsshub.app/ap/topics/apf-topnews
"""
try:
import feedparser
feed = feedparser.parse(rss_url)
articles = []
time_cutoff = datetime.now() - timedelta(days=cutoff)
for entry in feed.entries:
# Simple keyword matching
if query.lower() in entry.title.lower() or query.lower() in entry.get('summary', '').lower():
try:
# Parse publication date
pub_date = datetime(*entry.published_parsed[:6])
if pub_date >= time_cutoff:
domain = urlparse(entry.link).netloc
articles.append({
'title': entry.title,
'link': entry.link,
'date': pub_date.strftime('%Y-%m-%d'),
'datetime': pub_date.isoformat(),
'desc': entry.get('summary', '')[:200] + '...' if len(entry.get('summary', '')) > 200 else entry.get('summary', ''),
'domain': domain,
'source': feed.feed.get('title', 'RSS Feed')
})
                except Exception:
                    continue
return articles
except ImportError:
print("RSS parsing requires feedparser: pip install feedparser")
return []
except Exception as e:
print(f"Error parsing RSS feed: {e}")
return []
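# Sketch: query several feeds in one call. The URLs below are the examples listed in
# the docstring above plus the BBC technology feed used later in get_news_articles();
# feed availability is not guaranteed. The helper name is ours.
def _multi_rss(query="AI Agents", cutoff=1):
    feeds = [
        "http://feeds.bbci.co.uk/news/rss.xml",
        "http://feeds.bbci.co.uk/news/technology/rss.xml",
    ]
    articles = []
    for rss_url in feeds:
        articles.extend(get_rss_feed_articles(rss_url, query=query, cutoff=cutoff))
    return articles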
# Example usage function that mirrors your original structure
def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
"""
Main function to get news articles from various sources
Args:
query: Search term
cutoff_days: How many days back to search
        api_choice: 'newsapi', 'arxiv', 'gnews', or 'rss'
api_key: API key if required
"""
if api_choice == "newsapi":
news_articles = get_newsapi_articles(query, cutoff_days, api_key)
elif api_choice == "arxiv":
news_articles = get_arxiv_papers(query, 90, 10)
elif api_choice == "gnews":
news_articles = get_gnews_articles(query, cutoff_days)
elif api_choice == "rss":
# Example with BBC RSS feed
rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
else:
print("Invalid API choice")
return [], pd.DataFrame()
if not news_articles:
return "No news articles found for the given query and time period.", pd.DataFrame()
# Create DataFrame for display (matching your original structure)
display_data = []
for i, article in enumerate(news_articles):
display_data.append({
'Index': i,
'Title': article['title'],
'Link': article['link'],
'Date': article['date'],
'Description': article['desc'][:100] + "..." if len(article['desc']) > 100 else article['desc'],
'Domain': article['domain']
})
return news_articles, pd.DataFrame(display_data)
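# Minimal smoke test when the module is run directly. The "rss" choice needs no API
# key; "newsapi" and "gnews" assume NEWSAPI / GNEWSAPI keys are configured.
if __name__ == "__main__":
    articles, display_df = get_news_articles(query="AI Agents", cutoff_days=1, api_choice="rss")
    print(f"Fetched {len(display_df)} articles")
    print(display_df.head())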