import gradio as gr
import time
import os
import json
import logging
import xml.etree.ElementTree as ET
from datetime import date, timedelta, timezone, datetime
from urllib.parse import urlparse, urlencode, quote_plus
from collections import defaultdict

import pandas as pd
import numpy as np
import requests
import feedparser
from bs4 import BeautifulSoup
from dateutil import parser as dateutil_parser
from tldextract import extract
from dotenv import load_dotenv
from GoogleNews import GoogleNews

# Load API keys (NEWSAPI, GNEWSAPI) from a local .env file, if present.
load_dotenv()

# --- News source helpers ---
def get_google_news(query="AI Agents", cutoff=1):
    """Get Google News articles based on query"""
    days = cutoff
    language = 'en'
    to_day = datetime.today().strftime('%m/%d/%Y')
    from_day = (datetime.today() - timedelta(days=days)).strftime('%m/%d/%Y')
    # Exclude social/aggregator domains and off-topic keywords from the search.
    blackList = (' -site:winbuzzer.com -site:x.com -site:threads.com -site:instagram.com'
                 ' -site:linkedin.com -site:facebook.com -site:tiktok.com -site:reddit.com'
                 ' -site:youtube.com -site:newser.com -site:adexchanger.com'
                 ' -india -crypto -blockchain -bitcoin -DeFi')
    tQuery = query + blackList
    str_div = []

    print(f"Assembling news with cutoff {cutoff} for query: {tQuery}")
    try:
        googlenews = GoogleNews(start=from_day, end=to_day, lang=language)

        googlenews.search(tQuery)
        
        page1 = googlenews.result()
        df = pd.DataFrame(page1)

        time_cutoff = datetime.now() - timedelta(days=cutoff)

        for index, row in df.iterrows():
            
            try:
                news_time = dateutil_parser.parse(str(row['datetime']))
                if news_time >= time_cutoff:
                    domain = extract(row['link']).domain
                    str_a = row.to_dict()
                    str_a['datetime'] = str(news_time)
                    str_a.update({'domain': domain})
                    str_div.append(str_a)

                else:
                    print(f"Skipping article from {news_time} (older than cutoff {time_cutoff})")
            except Exception as inner_e:
                print(f"Error parsing datetime for row {index}: {inner_e}")
                continue

    except Exception as e:
        print("Error aggregating news " + str(e))

    return str_div

def resolve_redirect(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.url
    except Exception as e:
        print(f"Redirect failed: {e}")
        return url
    
def get_google_news_new(query="AI Agents", cutoff=1):
    """Get Google News articles based on query using RSS feed, output similar to GoogleNews package"""
    results = []

    print("Assembling news for " + str(query))

    try:
        # Build the Google News RSS search URL with a URL-encoded query
        encoded_query = quote_plus(query)
        url = f"https://news.google.com/rss/search?q={encoded_query}"

        # Parse the RSS feed
        feed = feedparser.parse(url)

        # Set time cutoff
        time_cutoff = datetime.now(timezone.utc) - timedelta(days=cutoff)

        for entry in feed.entries:
            try:
                # Parse the published or updated date
                if hasattr(entry, 'published'):
                    news_time = dateutil_parser.parse(entry.published)
                elif hasattr(entry, 'updated'):
                    news_time = dateutil_parser.parse(entry.updated)
                else:
                    continue

                # Skip old articles
                if news_time < time_cutoff:
                    continue

                # Resolve final article URL
                final_url = resolve_redirect(entry.link) if hasattr(entry, 'link') else ''

                # Estimate relative time (like '3 hours ago')
                time_diff = datetime.now(timezone.utc) - news_time
                if time_diff.days > 0:
                    relative_date = f"{time_diff.days} days ago"
                elif time_diff.seconds >= 3600:
                    relative_date = f"{time_diff.seconds // 3600} hours ago"
                else:
                    relative_date = f"{time_diff.seconds // 60} minutes ago"

                # Extract domain for media name
                domain_parts = extract(final_url)
                media = domain_parts.domain.capitalize() if domain_parts.domain else "Unknown"

                # Build result dict
                article_dict = {
                    'title': entry.title if hasattr(entry, 'title') else '',
                    'media': media,
                    'domain': media,
                    'date': relative_date,
                    'datetime': news_time,
                    'link': final_url,
                    'desc': entry.summary if hasattr(entry, 'summary') else '',
                    'img': getattr(entry, 'media_content', [{}])[0].get('url', '') if hasattr(entry, 'media_content') else ''
                }

                print(f"{article_dict}\n")

                results.append(article_dict)

            except Exception as inner_e:
                print(f"Error parsing entry: {inner_e}")
                continue

    except Exception as e:
        print("Error aggregating news " + str(e))

    print(f"I found {len(results)} items.")
    return results


def get_newsapi_articles(query="AI Agents", cutoff=1, api_key=None):
    """
    Get news articles from NewsAPI.org (free tier, but rate limited)
    Sign up at: https://newsapi.org/
    """
    # Use the key passed in, otherwise fall back to the NEWSAPI env variable.
    newsapi_key = api_key or os.getenv('NEWSAPI')
    if not newsapi_key:
        print("NewsAPI requires an API key. Sign up at https://newsapi.org/")
        return []

    days = cutoff
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')

    url = "https://newsapi.org/v2/everything"
    params = {
        'q': query,
        'from': from_date,
        'sortBy': 'publishedAt',
        'language': 'en',
        'apiKey': newsapi_key,
        'pageSize': 50
    }
    
    try:
        response = requests.get(url, params=params)
        print(f"NewsAPI response: HTTP {response.status_code}")
        response.raise_for_status()
        data = response.json()
        
        articles = []
        for article in data.get('articles', []):
            domain = urlparse(article['url']).netloc
            articles.append({
                'title': article['title'],
                'link': article['url'],
                'date': article['publishedAt'][:10],
                'datetime': article['publishedAt'],
                'desc': article['description'] or '',
                'domain': domain,
                'source': article['source']['name']
            })
        
        return articles
    except Exception as e:
        print(f"Error fetching from NewsAPI: {e}")
        return []
    
def get_gnews_articles(query="AI Agents", cutoff=1):
    """
    Get news articles from GNews (https://gnews.io/).
    Requires a free API key (read from the GNEWSAPI env variable); the free
    tier is limited to about 100 requests per day.
    """
    days = cutoff
    api_key = os.getenv('GNEWSAPI')
    if not api_key:
        print("GNews requires an API key. Sign up at https://gnews.io/")
        return []
    from_date = (datetime.today() - timedelta(days=days)).strftime('%Y-%m-%d')
    
    url = "https://gnews.io/api/v4/search"

    #https://gnews.io/api/v4/search?q=Google&lang=en&max=5&apikey=YOUR_API_KEY

    params = {
        'q': query,
        'apikey':api_key,
        'lang': 'en',
        'max': 25,
        'from': from_date + 'T00:00:00Z',
        'to': datetime.today().strftime('%Y-%m-%d') + 'T23:59:59Z',
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()

        # Keep a local copy of the raw response for inspection/debugging.
        with open('data_output.json', 'w') as f:
            json.dump(data, f, indent=2)

    except Exception as e:
        print(f"Error fetching from GNews: {e}")
        return []

    articles = data.get('articles', [])

    rArticles = []

    for article in articles:
        try:
            link = article.get('url', "")
            domain = urlparse(link).netloc

            rArticles.append({
                'title': article.get('title', ""),
                'link': link,
                'date': article.get('publishedAt', ""),
                'datetime': article.get('publishedAt', ""),
                'desc': article.get('description', ""),
                'domain': domain,
                'media': domain,
                'source': article.get('source', {}).get('name', domain),
            })

        except Exception as e:
            print(f"Error preparing article from GNews: {e}")
            continue

    return rArticles

def get_arxiv_papers(query="AI Agents", cutoff=7, max_results=25):
    """
    Get recent papers from Arxiv for a given keyword.
    Uses the Arxiv API (no API key required).
    
    Args:
        query (str): Search keyword(s).
        cutoff (int): How many days back to search.
        max_results (int): Maximum number of results to return.
    
    Returns:
        list of dicts with paper metadata.
    """
    # Arxiv API endpoint. Results come back sorted by submission date; the
    # cutoff filter is applied after retrieval.
    base_url = "http://export.arxiv.org/api/query?"

    if not isinstance(max_results, int) or max_results <= 0:
        max_results = 25  # fall back to a safe default

    # Construct query (Arxiv search syntax: all:keyword)
    search_query = f"all:{query}"

    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    url = base_url + urlencode(params)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.text)
    except Exception as e:
        print(f"Error fetching from Arxiv: {e}")
        return []

    print(f"arXiv response: HTTP {response.status_code}")

    ns = {"atom": "http://www.w3.org/2005/Atom"}

    papers = []
    for entry in root.findall("atom:entry", ns):
        published = entry.find("atom:published", ns).text
        published_dt = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")

        # Filter by cutoff
        if published_dt < (datetime.today() - timedelta(days=cutoff)):
            continue

        # Point at the PDF instead of the abstract page,
        # e.g. /abs/2509.09656v1 -> /pdf/2509.09656v1
        link = entry.find("atom:id", ns).text
        pdf_link = link.replace("/abs/", "/pdf/")

        title = entry.find("atom:title", ns).text.strip()
        summary = entry.find("atom:summary", ns).text.strip()
        authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]

        papers.append({
            "title": title,
            "link": pdf_link,
            "date": published,
            "datetime": published_dt.isoformat(),
            "desc": summary,
            "authors": authors,
            "source": "arXiv",
            "domain": "arxiv.org",
            "media": "arxiv.org",
        })

        # "link": "http://arxiv.org/abs/2509.09656v1",
        #https://arxiv.org/pdf/2509.09656v1

    # Optional: save to JSON
    with open("arxiv_output.json", "w") as f:
        json.dump(papers, f, indent=2)

    return papers

def get_rss_feed_articles(rss_url, query="AI Agents", cutoff=1):
    """
    Parse RSS feeds for news articles (Completely free)
    Example RSS feeds:
    - BBC: http://feeds.bbci.co.uk/news/rss.xml
    - Reuters: http://feeds.reuters.com/reuters/topNews
    - AP News: https://rsshub.app/ap/topics/apf-topnews
    """
    try:
        import feedparser
        
        feed = feedparser.parse(rss_url)
        articles = []
        time_cutoff = datetime.now() - timedelta(days=cutoff)
        
        for entry in feed.entries:
            # Simple keyword matching
            if query.lower() in entry.title.lower() or query.lower() in entry.get('summary', '').lower():
                try:
                    # Parse publication date
                    pub_date = datetime(*entry.published_parsed[:6])
                    if pub_date >= time_cutoff:
                        domain = urlparse(entry.link).netloc
                        articles.append({
                            'title': entry.title,
                            'link': entry.link,
                            'date': pub_date.strftime('%Y-%m-%d'),
                            'datetime': pub_date.isoformat(),
                            'desc': entry.get('summary', '')[:200] + '...' if len(entry.get('summary', '')) > 200 else entry.get('summary', ''),
                            'domain': domain,
                            'source': feed.feed.get('title', 'RSS Feed')
                        })
                except Exception:
                    continue
        
        return articles
    except ImportError:
        print("RSS parsing requires feedparser: pip install feedparser")
        return []
    except Exception as e:
        print(f"Error parsing RSS feed: {e}")
        return []

# Example usage function that mirrors your original structure
def get_news_articles(query="AI Agents", cutoff_days=1, api_choice="newsapi", api_key=None):
    """
    Main function to get news articles from various sources.

    Args:
        query: Search term
        cutoff_days: How many days back to search
        api_choice: 'newsapi', 'arxiv', 'gnews', or 'rss'
        api_key: API key if required
    """

    if api_choice == "newsapi":
        news_articles = get_newsapi_articles(query, cutoff_days, api_key)
    elif api_choice == "arxiv":
        # arXiv uses a wider window (90 days) and a small result cap (10 papers).
        news_articles = get_arxiv_papers(query, 90, 10)
    elif api_choice == "gnews":
        news_articles = get_gnews_articles(query, cutoff_days)
    elif api_choice == "rss":
        # Example with BBC RSS feed
        rss_url = "http://feeds.bbci.co.uk/news/technology/rss.xml"
        news_articles = get_rss_feed_articles(rss_url, query, cutoff_days)
    else:
        print("Invalid API choice")
        return "Invalid API choice. Use 'newsapi', 'arxiv', 'gnews', or 'rss'.", pd.DataFrame()
    
    if not news_articles:
        return "No news articles found for the given query and time period.", pd.DataFrame()
    
    # Create DataFrame for display (matching your original structure)
    display_data = []
    for i, article in enumerate(news_articles):
        display_data.append({
            'Index': i,
            'Title': article['title'],
            'Link': article['link'],
            'Date': article['date'],
            'Description': article['desc'][:100] + "..." if len(article['desc']) > 100 else article['desc'],
            'Domain': article['domain']
        })
    
    return news_articles, pd.DataFrame(display_data)
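

# Minimal local smoke test (a sketch, not part of the original app). It assumes
# outbound network access and exercises only the key-free paths ('rss' and
# 'arxiv'); the 'newsapi' and 'gnews' paths additionally need the NEWSAPI /
# GNEWSAPI keys described above.
if __name__ == "__main__":
    for choice in ("rss", "arxiv"):
        articles, df = get_news_articles(query="AI Agents", cutoff_days=2, api_choice=choice)
        # `articles` is either a list of article dicts or a "no results" message string.
        if isinstance(articles, str):
            print(f"--- {choice}: {articles}")
        else:
            print(f"--- {choice}: {len(articles)} item(s) ---")
            print(df[['Title', 'Date', 'Domain']].head().to_string(index=False))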