import gradio as gr
import time
import os
import json
import logging
import xml.etree.ElementTree as ET
from datetime import date, timedelta, timezone, datetime
from urllib.parse import urlparse, urlencode, quote_plus
from collections import defaultdict

import pandas as pd
import numpy as np
import requests
import feedparser
from bs4 import BeautifulSoup
from dateutil import parser as dateutil_parser
from tldextract import extract
from dotenv import load_dotenv
from GoogleNews import GoogleNews

# Load API keys (NEWSAPI, GNEWSAPI) from a local .env file, if present.
load_dotenv()

# --- News source helpers ---
def get_google_news(query="AI Agents", cutoff=1):
    """Get Google News articles based on query"""
    days = cutoff
    language = 'en'
    to_day = datetime.today().strftime('%m/%d/%Y')
    from_day = (datetime.today() - timedelta(days=days)).strftime('%m/%d/%Y')
    # Exclude social/aggregator domains and off-topic keywords from the search.
    blackList = (' -site:winbuzzer.com -site:x.com -site:threads.com -site:instagram.com'
                 ' -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com'
                 ' -site:youtube.com -site:newser.com -site:adexchanger.com'
                 ' -india -crypto -blockchain -bitcoin -DeFi')
    tQuery = query + blackList
    str_div = []

    print(f"Assembling news with cutoff {cutoff} for query: {tQuery}")
    try:
        googlenews = GoogleNews(start=from_day, end=to_day, lang=language)

        googlenews.search(tQuery)
        
        page1 = googlenews.result()
        df = pd.DataFrame(page1)

        time_cutoff = datetime.now() - timedelta(days=cutoff)

        for index, row in df.iterrows():
            
            try:
                news_time = dateutil_parser.parse(str(row['datetime']))
                if news_time >= time_cutoff:
                    domain = extract(row['link']).domain
                    str_a = row.to_dict()
                    str_a['datetime'] = str(news_time)
                    str_a.update({'domain': domain})
                    str_div.append(str_a)

                else:
                    print(f"Skipping article from {news_time} (older than cutoff {time_cutoff})")
            except Exception as inner_e:
                print(f"Error parsing datetime for row {index}: {inner_e}")
                continue

    except Exception as e:
        print("Error aggregating news " + str(e))

    return str_div

def resolve_redirect(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.url
    except Exception as e:
        print(f"Redirect failed: {e}")
        return url
    
def get_google_news_new(query="AI Agents", cutoff=1):
    """Get Google News articles based on query using RSS feed, output similar to GoogleNews package"""
    results = []

    print("Assembling news for " + str(query))

    try:
        # Build the Google News RSS search URL with a URL-encoded query
        encoded_query = quote_plus(query)
        url = f"https://news.google.com/rss/search?q={encoded_query}"

        # Parse the RSS feed
        feed = feedparser.parse(url)

        # Set time cutoff
        time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)

        for entry in feed.entries:
            try:
                # Parse the published or updated date
                if hasattr(entry, 'published'):
                    news_time = dateutil_parser.parse(entry.published)
                elif hasattr(entry, 'updated'):
                    news_time = dateutil_parser.parse(entry.updated)
                else:
                    continue

                # Skip old articles
                if news_time < time_cutoff:
                    continue

                # Resolve final article URL
                final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''

                # Estimate relative time (like '3 hours ago')
                time_diff = datetime.now(timezone.utc) - news_time
                if time_diff.days > 0:
                    relative_date = f"{time_diff.days} days ago"
                elif time_diff.seconds >= 3600:
                    relative_date = f"{time_diff.seconds // 3600} hours ago"
                else:
                    relative_date = f"{time_diff.seconds // 60} minutes ago"

                # Extract domain for media name
                domain_parts = extract(final_url)
                media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"

                # Build result dict
                article_dict = {
                    'title': entry.title if hasattr(entry, 'title') else '',
                    'media': media,
                    'domain': media,
                    'date': relative_date,
                    'datetime': news_time,
                    'link': final_url,
                    'desc': entry.summary if hasattr(entry, 'summary') else '',
                    'img': getattr(entry, 'media_content', [{}])[0].get('url', '') if hasattr(entry, 'media_content') else ''
                }

                print(f"{article_dict}\n")

                results.append(article_dict)

            except Exception as inner_e:
                print(f"Error parsing entry: {inner_e}")
                continue

    except Exception as e:
        print("Error aggregating news " + str(e))

    print(f"I found {len(results)} items.")
    return results


def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
    """
    Get news articles from NewsAPI.org (free tier, but rate limited)
    Sign up at: https://newsapi.org/
    """
    # Use the key passed in, otherwise fall back to the NEWSAPI env variable.
    newsapi_key = api_key or os.getenv('NEWSAPI')
    if not newsapi_key:
        print("NewsAPI requires an API key. Sign up at https://newsapi.org/")
        return []

    days = cutoff
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')

    url = "https://newsapi.org/v2/everything"
    params = {
        'q': query,
        'from': from_date,
        'sortBy': 'publishedAt',
        'language': 'en',
        'apiKey': newsapi_key,
        'pageSize': 50
    }
    
    try:
        response = requests.get(url, params=params)
        print(f"NewsAPI response: HTTP {response.status_code}")
        response.raise_for_status()
        data = response.json()
        
        articles = []
        for article in data.get('articles', []):
            domain = urlparse(article['url']).netloc
            articles.append({
                'title': article['title'],
                'link': article['url'],
                'date': article['publishedAt'][:10],
                'datetime': article['publishedAt'],
                'desc': article['description'] or '',
                'domain': domain,
                'source': article['source']['name']
            })
        
        return articles
    except Exception as e:
        print(f"Error fetching from NewsAPI: {e}")
        return []
    
def get_gnews_articles(query="AI Agents", cutoff=1):
    """
    Get news articles from GNews (https://gnews.io/).
    Requires a free API key (read from the GNEWSAPI env variable); the free
    tier is limited to about 100 requests per day.
    """
    days = cutoff
    api_key = os.getenv('GNEWSAPI')
    if not api_key:
        print("GNews requires an API key. Sign up at https://gnews.io/")
        return []
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
    
    url = "https://gnews.io/api/v4/search"

    #https://gnews.io/api/v4/search?q=Google&lang=en&max=5&apikey=YOUR_API_KEY

    params = {
        'q': query,
        'apikey':api_key,
        'lang': 'en',
        'max': 25,
        'from': from_date + 'T00:00:00Z',
        'to': datetime.today().strftime('%Y-%m-%d') + 'T23:59:59Z',
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()

        # Keep a local copy of the raw response for inspection/debugging.
        with open('data_output.json', 'w') as f:
            json.dump(data, f, indent=2)

    except Exception as e:
        print(f"Error fetching from GNews: {e}")
        return []

    articles = data.get('articles', [])

    rArticles = []

    for article in articles:
        try:
            link = article.get('url', "")
            domain = urlparse(link).netloc

            rArticles.append({
                'title': article.get('title', ""),
                'link': link,
                'date': article.get('publishedAt', ""),
                'datetime': article.get('publishedAt', ""),
                'desc': article.get('description', ""),
                'domain': domain,
                'media': domain,
                'source': article.get('source', {}).get('name', domain),
            })

        except Exception as e:
            print(f"Error preparing article from GNews: {e}")
            continue

    return rArticles

def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
    """
    Get recent papers from Arxiv for a given keyword.
    Uses the Arxiv API (no API key required).
    
    Args:
        query (str): Search keyword(s).
        cutoff (int): How many days back to search.
        max_results (int): Maximum number of results to return.
    
    Returns:
        list of dicts with paper metadata.
    """
    # Arxiv API endpoint. Results come back sorted by submission date; the
    # cutoff filter is applied after retrieval.
    base_url = "http://export.arxiv.org/api/query?"

    if not isinstance(max_results, int) or max_results <= 0:
        max_results = 25  # fall back to a safe default

    # Construct query (Arxiv search syntax: all:keyword)
    search_query = f"all:{query}"

    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    url = base_url + urlencode(params)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except Exception as e:
        print(f"Error fetching from Arxiv: {e}")
        return []

    print(f"arXiv response: HTTP {response.status_code}")

    ns = {"atom": "http://www.w3.org/2005/Atom"}

    papers = []
    for entry in root.findall("atom:entry", ns):
        published = entry.find("atom:published", ns).text
        published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")

        # Filter by cutoff
        if published_dt < (datetime.today() - timedelta(days=cutoff)):
            continue

        # Point at the PDF instead of the abstract page,
        # e.g. /abs/2509.09656v1 -> /pdf/2509.09656v1
        link = entry.find("atom:id", ns).text
        pdf_link = link.replace("/abs/", "/pdf/")

        title = entry.find("atom:title", ns).text.strip()
        summary = entry.find("atom:summary", ns).text.strip()
        authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]

        papers.append({
            "title": title,
            "link": pdf_link,
            "date": published,
            "datetime": published_dt.isoformat(),
            "desc": summary,
            "authors": authors,
            "source": "arXiv",
            "domain": "arxiv.org",
            "media": "arxiv.org",
        })

        # "link": "http://arxiv.org/abs/2509.09656v1",
        #https://arxiv.org/pdf/2509.09656v1

    # Optional: save to JSON
    with open("arxiv_output.json", "w") as f:
        json.dump(papers, f, indent=2)

    return papers

def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
    """
    Parse RSS feeds for news articles (Completely free)
    Example RSS feeds:
    - BBC: http://feeds.bbci.co.uk/news/rss.xml
    - Reuters: http://feeds.reuters.com/reuters/topNews
    - AP News: https://rsshub.app/ap/topics/apf-topnews
    """
    try:
        import feedparser
        
        feed = feedparser.parse(rss_url)
        articles = []
        time_cutoff = datetime.now() - timedelta(days=cutoff)
        
        for entry in feed.entries:
            # Simple keyword matching
            if query.lower() in entry.title.lower() or query.lower() in entry.get('summary', '').lower():
                try:
                    # Parse publication date
                    pub_date = datetime(*entry.published_parsed[:6])
                    if pub_date >= time_cutoff:
                        domain = urlparse(entry.link).netloc
                        articles.append({
                            'title': entry.title,
                            'link': entry.link,
                            'date': pub_date.strftime('%Y-%m-%d'),
                            'datetime': pub_date.isoformat(),
                            'desc': entry.get('summary', '')[:200] + '...' if len(entry.get('summary', '')) > 200 else entry.get('summary', ''),
                            'domain': domain,
                            'source': feed.feed.get('title', 'RSS Feed')
                        })
                except Exception:
                    continue
        
        return articles
    except ImportError:
        print("RSS parsing requires feedparser: pip install feedparser")
        return []
    except Exception as e:
        print(f"Error parsing RSS feed: {e}")
        return []

# Example usage function that mirrors your original structure
def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
    """
    Main function to get news articles from various sources.

    Args:
        query: Search term
        cutoff_days: How many days back to search
        api_choice: 'newsapi', 'arxiv', 'gnews', or 'rss'
        api_key: API key if required
    """

    if api_choice == "newsapi":
        news_articles = get_newsapi_articles(query, cutoff_days, api_key)
    elif api_choice == "arxiv":
        # arXiv uses a wider window (90 days) and a small result cap (10 papers).
        news_articles = get_arxiv_papers(query, 90, 10)
    elif api_choice == "gnews":
        news_articles = get_gnews_articles(query, cutoff_days)
    elif api_choice == "rss":
        # Example with BBC RSS feed
        rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
        news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
    else:
        print("Invalid API choice")
        return "Invalid API choice. Use 'newsapi', 'arxiv', 'gnews', or 'rss'.", pd.DataFrame()
    
    if not news_articles:
        return "No news articles found for the given query and time period.", pd.DataFrame()
    
    # Create DataFrame for display (matching your original structure)
    display_data = []
    for i, article in enumerate(news_articles):
        display_data.append({
            'Index': i,
            'Title': article['title'],
            'Link': article['link'],
            'Date': article['date'],
            'Description': article['desc'][:100] + "..." if len(article['desc']) > 100 else article['desc'],
            'Domain': article['domain']
        })
    
    return news_articles, pd.DataFrame(display_data)
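

# Minimal local smoke test (a sketch, not part of the original app). It assumes
# outbound network access and exercises only the key-free paths ('rss' and
# 'arxiv'); the 'newsapi' and 'gnews' paths additionally need the NEWSAPI /
# GNEWSAPI keys described above.
if __name__ == "__main__":
    for choice in ("rss", "arxiv"):
        articles, df = get_news_articles(query="AI Agents", cutoff_days=2, api_choice=choice)
        # `articles` is either a list of article dicts or a "no results" message string.
        if isinstance(articles, str):
            print(f"--- {choice}: {articles}")
        else:
            print(f"--- {choice}: {len(articles)} item(s) ---")
            print(df[['Title', 'Date', 'Domain']].head().to_string(index=False))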