# Postwriter / connector.py
import gradio as gr
import time
from datetime import date, timedelta, timezone, datetime
import os
import pandas as pd
import numpy as np
import logging
import requests
from bs4 import BeautifulSoup
import urllib.parse
import dateutil.parser
from dateutil import parser as dateutil_parser
from tldextract import extract
from urllib.parse import quote_plus, urlparse
from collections import defaultdict
from dotenv import load_dotenv
from GoogleNews import GoogleNews
import feedparser
load_dotenv()  # pull API keys (e.g. NEWSAPI, GNEWSAPI) from a local .env file if present
# --- News source connectors ---
def get_google_news(query="AI Agents", cutoff=1):
"""Get Google News articles based on query"""
days = cutoff
language = 'en'
to_day = datetime.today().strftime('%m/%d/%Y')
from_day = (datetime.today() - timedelta(days=days)).strftime('%m/%d/%Y')
blackList=' -site:winbuzzer.com -site:x.com -site:threads.com -site:instagram.com -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com -site:youtube.com -site:newser.com -site:adexchanger.com -india -crypto -blockchain -bitcoin -DeFi'
tQuery=query+blackList
str_div = []
print(f"Assembling news with cutoff {cutoff} for query: {str(tQuery)} ")
try:
googlenews = GoogleNews(start=from_day, end=to_day, lang=language)
googlenews.search(tQuery)
page1 = googlenews.result()
df = pd.DataFrame(page1)
time_cutoff = datetime.now() - timedelta(days=cutoff)
for index, row in df.iterrows():
try:
news_time = dateutil.parser.parse(str(row['datetime']))
if news_time >= time_cutoff:
domain = extract(row['link']).domain
str_a = row.to_dict()
str_a['datetime'] = str(news_time)
str_a.update({'domain': domain})
str_div.append(str_a)
else:
print(f" Skipping {news_time} > {time_cutoff}")
except Exception as inner_e:
print(f"Error parsing datetime for row {index}: {inner_e}")
continue
except Exception as e:
print("Error aggregating news " + str(e))
return str_div
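# Illustrative sketch (not called anywhere in this module): turning the list of dicts
# returned by get_google_news() into a deduplicated DataFrame. The helper name
# _google_news_to_df is ours; the column names mirror the keys assembled above
# ('title', 'link', 'datetime', 'domain').
def _google_news_to_df(query="AI Agents", cutoff=1):
    items = get_google_news(query=query, cutoff=cutoff)
    df = pd.DataFrame(items)
    if df.empty:
        return df
    # Drop duplicate links and show the newest articles first (datetime is an ISO-like string).
    df = df.drop_duplicates(subset='link').sort_values('datetime', ascending=False)
    return df.reset_index(drop=True)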
def resolve_redirect(url):
try:
response = requests.head(url, allow_redirects=True, timeout=5)
return response.url
except Exception as e:
print(f"Redirect failed: {e}")
return url
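# Optional sketch: resolve_redirect() issues one HEAD request per link, and the same
# Google News URLs can repeat across queries, so a caller might memoise it.
# functools.lru_cache is standard library; the wrapper name is ours.
from functools import lru_cache

@lru_cache(maxsize=512)
def _resolve_redirect_cached(url):
    return resolve_redirect(url)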
def get_google_news_new(query="AI Agents", cutoff=1):
"""Get Google News articles based on query using RSS feed, output similar to GoogleNews package"""
results = []
print("Assembling news for " + str(query))
try:
# Create RSS URL with proper encoding
        encoded_query = quote_plus(query)
url = f"https://news.google.com/rss/search?q={encoded_query}"
# Parse the RSS feed
feed = feedparser.parse(url)
# Set time cutoff
time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)
for entry in feed.entries:
try:
# Parse the published or updated date
if hasattr(entry, 'published'):
news_time = dateutil_parser.parse(entry.published)
elif hasattr(entry, 'updated'):
news_time = dateutil_parser.parse(entry.updated)
else:
continue
# Skip old articles
if news_time < time_cutoff:
continue
# Resolve final article URL
final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''
# Estimate relative time (like '3 hours ago')
time_diff = datetime.now(timezone.utc) - news_time
if time_diff.days > 0:
relative_date = f"{time_diff.days} days ago"
elif time_diff.seconds >= 3600:
relative_date = f"{time_diff.seconds // 3600} hours ago"
else:
relative_date = f"{time_diff.seconds // 60} minutes ago"
# Extract domain for media name
domain_parts = extract(final_url)
media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"
# Build result dict
article_dict = {
'title': entry.title if hasattr(entry, 'title') else '',
'media': media,
'domain': media,
'date': relative_date,
'datetime': news_time,
'link': final_url,
'desc': entry.summary if hasattr(entry, 'summary') else '',
'img': getattr(entry, 'media_content', [{}])[0].get('url', '') if hasattr(entry, 'media_content') else ''
}
print(f"{article_dict}\n")
results.append(article_dict)
except Exception as inner_e:
print(f"Error parsing entry: {inner_e}")
continue
except Exception as e:
print("Error aggregating news " + str(e))
print(f"I found {len(results)} items.")
return results
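# Sketch only: unlike get_google_news() above, get_google_news_new() does not apply a
# blacklist, so a caller could filter domains afterwards. The blocked set below is an
# illustrative example, not a definitive list; the helper name is ours.
def _filter_rss_results(results, blocked=("reddit", "youtube", "linkedin", "facebook")):
    # 'domain' holds the capitalised registered domain (e.g. 'Reddit'), so compare case-insensitively.
    return [r for r in results if r.get('domain', '').lower() not in blocked]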
def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
"""
Get news articles from NewsAPI.org (Free tier: 1000 requests/month)
Sign up at: https://newsapi.org/
"""
    newsapi_key = api_key or os.getenv('NEWSAPI')
    if not newsapi_key:
        print("NewsAPI requires an API key (pass api_key or set the NEWSAPI env var). Sign up at https://newsapi.org/")
        return []
    days = cutoff
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
url = "https://newsapi.org/v2/everything"
params = {
'q': query,
'from': from_date,
'sortBy': 'publishedAt',
'language': 'en',
'apiKey': newsapi_key,
'pageSize': 50
}
try:
response = requests.get(url, params=params)
        print(f"NewsAPI response status: {response.status_code}")
response.raise_for_status()
data = response.json()
articles = []
for article in data.get('articles', []):
domain = urlparse(article['url']).netloc
articles.append({
'title': article['title'],
'link': article['url'],
'date': article['publishedAt'][:10],
'datetime': article['publishedAt'],
'desc': article['description'] or '',
'domain': domain,
'source': article['source']['name']
})
return articles
except Exception as e:
print(f"Error fetching from NewsAPI: {e}")
return []
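# Sketch: NewsAPI returns 'publishedAt' as an ISO string, while some of the other
# connectors return datetime objects, so a caller mixing sources might normalise the
# field first. dateutil_parser is the alias imported at the top of this module; the
# helper name is ours.
def _normalise_newsapi_dates(articles):
    for a in articles:
        try:
            a['datetime'] = dateutil_parser.parse(str(a['datetime']))
        except (ValueError, TypeError):
            pass  # leave unparseable values untouched
    return articles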
def get_gnews_articles(query="AI Agents", cutoff=1):
"""
Get news articles from GNews (No API key required, but has rate limits)
Completely free but limited to 100 requests per day
"""
import json
days = cutoff
api_key=os.environ['GNEWSAPI']
from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
url = "https://gnews.io/api/v4/search"
#https://gnews.io/api/v4/search?q=Google&lang=en&max=5&apikey=YOUR_API_KEY
params = {
'q': query,
'apikey':api_key,
'lang': 'en',
'max': 25,
'from': from_date + 'T00:00:00Z',
'to': datetime.today().strftime('%Y-%m-%d') + 'T23:59:59Z',
}
try:
response = requests.get(url, params=params)
response.raise_for_status()
data = response.json()
        with open('data_output.json', 'w') as f:
            json.dump(data, f, indent=2)
except Exception as e:
print(f"Error fetching from GNews: {e}")
return []
articles = data.get('articles', [])
    rArticles = []
    for article in articles:
        try:
            link = article.get('url', "")
            domain = urlparse(link).netloc
            rArticles.append({
                'title': article.get('title', ""),
                'link': link,
                'date': article.get('publishedAt', ""),
                'datetime': article.get('publishedAt', ""),
                'desc': article.get('description', ""),
                'domain': domain,
                'media': domain,
                'source': article.get('source', {}).get('name', ""),
            })
        except Exception as e:
            print(f"Error preparing article from GNews: {e}")
            continue
    return rArticles
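# Sketch: the GNews free tier is rate limited, so a caller might pause and retry once
# when a request comes back empty. The retry count and delay are illustrative choices;
# the helper name is ours.
def _gnews_with_retry(query="AI Agents", cutoff=1, retries=1, delay=5):
    for attempt in range(retries + 1):
        articles = get_gnews_articles(query=query, cutoff=cutoff)
        if articles or attempt == retries:
            return articles
        time.sleep(delay)  # brief pause before retrying
    return []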
def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
"""
Get recent papers from Arxiv for a given keyword.
Uses the Arxiv API (no API key required).
Args:
query (str): Search keyword(s).
cutoff (int): How many days back to search.
max_results (int): Maximum number of results to return.
Returns:
list of dicts with paper metadata.
"""
    import json
    from urllib.parse import urlencode
    import xml.etree.ElementTree as ET
    # No date range is passed to the API; results are sorted by submittedDate
    # and the cutoff is applied after fetching (see the loop below).
# Arxiv API endpoint
base_url = "http://export.arxiv.org/api/query?"
if not isinstance(max_results, int) or max_results <= 0:
max_results = 25 # fallback to safe default
# Construct query (Arxiv search syntax: all:keyword)
search_query = f"all:{query}"
params = {
"search_query": search_query,
"start": 0,
"max_results": max_results,
"sortBy": "submittedDate",
"sortOrder": "descending",
}
url = base_url + urlencode(params)
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
root = ET.fromstring(response.text)
except Exception as e:
print(f"Error fetching from Arxiv: {e}")
return []
    print(f"Arxiv query returned HTTP {response.status_code}")
ns = {"atom": "http://www.w3.org/2005/Atom"}
papers = []
for entry in root.findall("atom:entry", ns):
published = entry.find("atom:published", ns).text
published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
# Filter by cutoff
if published_dt < (datetime.today() - timedelta(days=cutoff)):
continue
link = entry.find("atom:id", ns).text
pdf_link = link.replace("/abs/", "/pdf/")
title = entry.find("atom:title", ns).text.strip()
summary = entry.find("atom:summary", ns).text.strip()
authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]
papers.append({
"title": title,
"link": pdf_link,
"date": published,
"datetime": published_dt.isoformat(),
"desc": summary,
"authors": authors,
"source": "arXiv",
"domain": "arxiv.org",
"media": "arxiv.org",
})
# "link": "http://arxiv.org/abs/2509.09656v1",
#https://arxiv.org/pdf/2509.09656v1
# Optional: save to JSON
with open("arxiv_output.json", "w") as f:
json.dump(papers, f, indent=2)
return papers
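# Sketch: format the arXiv results as a short markdown digest (e.g. for a post draft).
# Field names match the dicts built above; the formatting choices (top 5 papers,
# first three authors) and the helper name are ours.
def _arxiv_digest(query="AI Agents", cutoff=7, top_n=5):
    papers = get_arxiv_papers(query=query, cutoff=cutoff, max_results=25)
    lines = []
    for p in papers[:top_n]:
        authors = ", ".join(p['authors'][:3])
        lines.append(f"- [{p['title']}]({p['link']}) by {authors}")
    return "\n".join(lines)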
def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
"""
Parse RSS feeds for news articles (Completely free)
Example RSS feeds:
- BBC: http://feeds.bbci.co.uk/news/rss.xml
- Reuters: http://feeds.reuters.com/reuters/topNews
- AP News: https://rsshub.app/ap/topics/apf-topnews
"""
try:
import feedparser
feed = feedparser.parse(rss_url)
articles = []
time_cutoff = datetime.now() - timedelta(days=cutoff)
for entry in feed.entries:
# Simple keyword matching
if query.lower() in entry.title.lower() or query.lower() in entry.get('summary', '').lower():
try:
# Parse publication date
pub_date = datetime(*entry.published_parsed[:6])
if pub_date >= time_cutoff:
domain = urlparse(entry.link).netloc
articles.append({
'title': entry.title,
'link': entry.link,
'date': pub_date.strftime('%Y-%m-%d'),
'datetime': pub_date.isoformat(),
'desc': entry.get('summary', '')[:200] + '...' if len(entry.get('summary', '')) > 200 else entry.get('summary', ''),
'domain': domain,
'source': feed.feed.get('title', 'RSS Feed')
})
                except Exception:
                    continue
return articles
except ImportError:
print("RSS parsing requires feedparser: pip install feedparser")
return []
except Exception as e:
print(f"Error parsing RSS feed: {e}")
return []
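# Sketch: query several feeds in one call. The URLs below are the examples listed in
# the docstring above plus the BBC technology feed used later in get_news_articles();
# feed availability is not guaranteed. The helper name is ours.
def _multi_rss(query="AI Agents", cutoff=1):
    feeds = [
        "http://feeds.bbci.co.uk/news/rss.xml",
        "http://feeds.bbci.co.uk/news/technology/rss.xml",
    ]
    articles = []
    for rss_url in feeds:
        articles.extend(get_rss_feed_articles(rss_url, query=query, cutoff=cutoff))
    return articles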
# Example usage function that mirrors your original structure
def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
"""
Main function to get news articles from various sources
Args:
query: Search term
cutoff_days: How many days back to search
        api_choice: 'newsapi', 'arxiv', 'gnews', or 'rss'
api_key: API key if required
"""
if api_choice == "newsapi":
news_articles = get_newsapi_articles(query, cutoff_days, api_key)
elif api_choice == "arxiv":
news_articles = get_arxiv_papers(query, 90, 10)
elif api_choice == "gnews":
news_articles = get_gnews_articles(query, cutoff_days)
elif api_choice == "rss":
# Example with BBC RSS feed
rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
else:
print("Invalid API choice")
return [], pd.DataFrame()
if not news_articles:
return "No news articles found for the given query and time period.", pd.DataFrame()
# Create DataFrame for display (matching your original structure)
display_data = []
for i, article in enumerate(news_articles):
display_data.append({
'Index': i,
'Title': article['title'],
'Link': article['link'],
'Date': article['date'],
'Description': article['desc'][:100] + "..." if len(article['desc']) > 100 else article['desc'],
'Domain': article['domain']
})
return news_articles, pd.DataFrame(display_data)
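# Minimal smoke test when the module is run directly. The "rss" choice needs no API
# key; "newsapi" and "gnews" assume NEWSAPI / GNEWSAPI keys are configured.
if __name__ == "__main__":
    articles, display_df = get_news_articles(query="AI Agents", cutoff_days=1, api_choice="rss")
    print(f"Fetched {len(display_df)} articles")
    print(display_df.head())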