import sqlite3
from datetime import datetime, timezone
from time import sleep

import feedparser
from newspaper import Article

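# Third-party dependencies (PyPI package names): pip install feedparser newspaper3k
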
# Mega-complete RSS feed list
rss_feeds = {
    # Original feeds
    "Reuters": "http://feeds.reuters.com/reuters/topNews",
    "NPR": "https://www.npr.org/rss/rss.php?id=1001",
    "CNN": "http://rss.cnn.com/rss/cnn_topstories.rss",
    "NYTimes": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "The Guardian": "https://www.theguardian.com/world/rss",
    "Ars Technica": "https://feeds.arstechnica.com/arstechnica/index",
    "Wired": "https://www.wired.com/feed/rss",
    "TechCrunch": "https://techcrunch.com/feed/",
    "Al Jazeera": "https://www.aljazeera.com/xml/rss/all.xml",
    "BBC": "http://feeds.bbci.co.uk/news/rss.xml",
    "AP Top Headlines": "https://apnews.com/apf-topnews.rss",
    "Bloomberg Site": "https://www.bloomberg.com/feeds/site.xml",
    "NPR Politics": "https://www.npr.org/rss/rss.php?id=1014",
    "Washington Post National": "http://feeds.washingtonpost.com/rss/national",
    "MIT Technology Review": "https://www.technologyreview.com/feed/",
    "Scientific American": "https://www.scientificamerican.com/feed/rss/",
    "Nature News": "https://www.nature.com/nature/articles?type=news&format=rss",
    "NASA Breaking News": "https://www.nasa.gov/rss/dyn/breaking_news.rss",
    "Financial Times": "https://www.ft.com/?format=rss",
    "CNBC Top News": "https://www.cnbc.com/id/100003114/device/rss/rss.html",
    "MarketWatch Top Stories": "https://www.marketwatch.com/rss/topstories",
    "Politico Latest": "https://www.politico.com/rss/politics08.xml",
    "The Hill": "https://thehill.com/rss/syndicator/19110",
    "Inside Climate News": "https://insideclimatenews.org/feed/",
    "Climate Central": "https://www.climatecentral.org/news/feed",
    "ProPublica": "https://www.propublica.org/feeds/",
    "Reuters Science": "http://feeds.reuters.com/reuters/scienceNews",
    "USA Today": "https://rssfeeds.usatoday.com/usatoday-NewsTopStories",
    "CBS News": "https://www.cbsnews.com/latest/rss/main",
    "NBC News": "https://feeds.nbcnews.com/nbcnews/public/news",
    "ABC News": "https://abcnews.go.com/abcnews/topstories",
    "Engadget": "https://www.engadget.com/rss.xml",
    "Gizmodo": "https://gizmodo.com/rss",
    "Space.com": "https://www.space.com/feeds/all",
    "New Scientist": "https://www.newscientist.com/feed/home/",
    "Forbes": "https://www.forbes.com/real-time/feed2/",
    "Reuters Business": "http://feeds.reuters.com/reuters/businessNews",
    "BBC Politics": "http://feeds.bbci.co.uk/news/politics/rss.xml",
    "Vox Politics": "https://www.vox.com/rss/policy/index.xml",
    "Rolling Stone": "https://www.rollingstone.com/feed/",
    "Variety": "https://variety.com/feed/",
    "The Economist": "https://www.economist.com/the-world-this-week/rss.xml",
    "Wall Street Journal": "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
    "France 24": "https://www.france24.com/en/rss",
    "Deutsche Welle": "https://rss.dw.com/xml/rss-en-world",
    "IEEE Spectrum": "https://spectrum.ieee.org/rss/fulltext",
    "ScienceDaily": "https://www.sciencedaily.com/rss/top/science.xml",
    "Live Science": "https://www.livescience.com/feeds/all",
    "Business Insider": "https://www.businessinsider.com/rss",
    "Investopedia News": "https://www.investopedia.com/feedbuilder/feed/getfeed/?feedName=news",
    "Brookings Institution": "https://www.brookings.edu/feed/",
    "Environmental News Network": "https://www.enn.com/rss",
    "Earth Island Journal": "https://earthisland.org/journal/feed",
    "Hollywood Reporter": "https://www.hollywoodreporter.com/t/feed/",
    "Pitchfork": "https://pitchfork.com/rss/news/",
    "Le Monde (French)": "https://www.lemonde.fr/rss/une.xml",
    "Der Spiegel (German)": "https://www.spiegel.de/international/index.rss",
    "NHK World (Japanese)": "https://www3.nhk.or.jp/nhkworld/en/news/rss.xml",
    "RT News (Russian)": "https://www.rt.com/rss/news/",

    # Additional feeds
    "Chicago Tribune": "https://www.chicagotribune.com/arcio/rss/category/news/",
    "Los Angeles Times": "https://www.latimes.com/local/rss2.0.xml",
    "The Atlantic": "https://www.theatlantic.com/feed/all/",
    "BuzzFeed News": "https://www.buzzfeednews.com/feed",
    "KQED": "https://www.kqed.org/rss/news",
    "South China Morning Post": "https://www.scmp.com/rss/91/feed",
    "Japan Times": "https://www.japantimes.co.jp/feed/",
    "The Hindu": "https://www.thehindu.com/news/national/feeder/default.rss",
    "Seeking Alpha": "https://seekingalpha.com/feed.xml",
    "TheStreet": "https://www.thestreet.com/rss/frontpage",
    "Zero Hedge": "https://www.zerohedge.com/fullrss",
    "SCOTUSblog": "https://www.scotusblog.com/feed/",
    "Law360": "https://www.law360.com/rss/news",
    "Smithsonian Magazine": "https://www.smithsonianmag.com/feed/rss",
    "Artforum": "https://www.artforum.com/feed",
    "Medscape": "https://www.medscape.com/rss",
    "CDC Newsroom": "https://tools.cdc.gov/api/v2/resources/media/405952.rss",
}
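
# Feeds like these move or go offline over time. An optional liveness pass
# before the main loop can flag dead ones; this sketch uses feedparser's
# documented `bozo` flag, which is set when a feed fails to parse cleanly:
#
#   for name, feed_url in rss_feeds.items():
#       parsed = feedparser.parse(feed_url)
#       if parsed.bozo:
#           print(f"[WARN] {name}: {parsed.get('bozo_exception', 'parse error')}")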

# SQLite DB setup
print("📦 Initializing database: news_articles.db")
conn = sqlite3.connect("news_articles.db")
cur = conn.cursor()
cur.execute('''
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT,
    source TEXT,
    url TEXT UNIQUE,
    published TEXT,
    scraped_at TEXT,
    content TEXT
)
''')
conn.commit()

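# Note: because url is declared UNIQUE, the SELECT-then-INSERT pattern in the
# main loop below could be collapsed into a single statement. An equivalent
# sketch (cursor.rowcount is 0 when SQLite ignored the row as a duplicate):
#
#   cur.execute("INSERT OR IGNORE INTO articles "
#               "(title, source, url, published, scraped_at, content) "
#               "VALUES (?, ?, ?, ?, ?, ?)", row)
#   if cur.rowcount == 0:
#       print(f"🔁 Skipping duplicate: {url}")
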
def extract_article(url):
    """Download and parse one article; returns (title, text) on success,
    or (None, "[ERROR] ...") on failure."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.title, article.text
    except Exception as e:
        return None, f"[ERROR] {e}"

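# Some sites reject newspaper3k's default user agent or respond slowly. If
# that becomes a problem, Article accepts a Config object; a minimal sketch
# using newspaper's documented Config attributes (the UA string here is just
# an illustrative example, not a requirement):
#
#   from newspaper import Config
#   cfg = Config()
#   cfg.browser_user_agent = "Mozilla/5.0 (compatible; news-scraper/1.0)"
#   cfg.request_timeout = 10  # seconds
#   article = Article(url, config=cfg)
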
# Main scraping loop
for source, feed_url in rss_feeds.items():
    print(f"\n📡 FETCHING FROM: {source}")
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    feed = feedparser.parse(feed_url)

    for entry in feed.entries[:5]:  # limit to 5 articles per feed for politeness
        url = entry.get("link")
        if not url:  # some feeds emit entries without a link; skip them
            continue
        published = entry.get("published", "")
        scraped_at = datetime.now(timezone.utc).isoformat()

        cur.execute("SELECT 1 FROM articles WHERE url = ?", (url,))
        if cur.fetchone():
            print(f"🔁 Skipping duplicate: {url}")
            continue

        title, content = extract_article(url)
        if content and not content.startswith("[ERROR]"):
            cur.execute('''
                INSERT INTO articles (title, source, url, published, scraped_at, content)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (title, source, url, published, scraped_at, content))
            conn.commit()
            print(f"✅ Saved: {title[:60]}...")
            print(f"🔗 {url}\n")
        else:
            print(f"⚠️ Failed to extract: {url}")

        sleep(1)  # polite pause to avoid hammering servers

conn.close()
print("\n🎉 All done! Articles saved to 'news_articles.db'")

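# A quick way to verify the results afterwards (sketch; run in a separate
# session or after the script finishes, since the connection above is closed):
#
#   conn = sqlite3.connect("news_articles.db")
#   for source, count in conn.execute(
#           "SELECT source, COUNT(*) FROM articles GROUP BY source ORDER BY 2 DESC"):
#       print(f"{source}: {count}")
#   conn.close()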