import sqlite3
from datetime import datetime, timezone
from time import sleep

import feedparser
from newspaper import Article

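# Third-party dependencies (PyPI package names): pip install feedparser newspaper3k
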
# Mega-complete RSS feed list
rss_feeds = {
    # Original feeds
    "Reuters": "http://feeds.reuters.com/reuters/topNews",
    "NPR": "https://www.npr.org/rss/rss.php?id=1001",
    "CNN": "http://rss.cnn.com/rss/cnn_topstories.rss",
    "NYTimes": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "The Guardian": "https://www.theguardian.com/world/rss",
    "Ars Technica": "https://feeds.arstechnica.com/arstechnica/index",
    "Wired": "https://www.wired.com/feed/rss",
    "TechCrunch": "https://techcrunch.com/feed/",
    "Al Jazeera": "https://www.aljazeera.com/xml/rss/all.xml",
    "BBC": "http://feeds.bbci.co.uk/news/rss.xml",
    "AP Top Headlines": "https://apnews.com/apf-topnews.rss",
    "Bloomberg Site": "https://www.bloomberg.com/feeds/site.xml",
    "NPR Politics": "https://www.npr.org/rss/rss.php?id=1014",
    "Washington Post National": "http://feeds.washingtonpost.com/rss/national",
    "MIT Technology Review": "https://www.technologyreview.com/feed/",
    "Scientific American": "https://www.scientificamerican.com/feed/rss/",
    "Nature News": "https://www.nature.com/nature/articles?type=news&format=rss",
    "NASA Breaking News": "https://www.nasa.gov/rss/dyn/breaking_news.rss",
    "Financial Times": "https://www.ft.com/?format=rss",
    "CNBC Top News": "https://www.cnbc.com/id/100003114/device/rss/rss.html",
    "MarketWatch Top Stories": "https://www.marketwatch.com/rss/topstories",
    "Politico Latest": "https://www.politico.com/rss/politics08.xml",
    "The Hill": "https://thehill.com/rss/syndicator/19110",
    "Inside Climate News": "https://insideclimatenews.org/feed/",
    "Climate Central": "https://www.climatecentral.org/news/feed",
    "ProPublica": "https://www.propublica.org/feeds/",
    "Reuters Science": "http://feeds.reuters.com/reuters/scienceNews",
    "USA Today": "https://rssfeeds.usatoday.com/usatoday-NewsTopStories",
    "CBS News": "https://www.cbsnews.com/latest/rss/main",
    "NBC News": "https://feeds.nbcnews.com/nbcnews/public/news",
    "ABC News": "https://abcnews.go.com/abcnews/topstories",
    "Engadget": "https://www.engadget.com/rss.xml",
    "Gizmodo": "https://gizmodo.com/rss",
    "Space.com": "https://www.space.com/feeds/all",
    "New Scientist": "https://www.newscientist.com/feed/home/",
    "Forbes": "https://www.forbes.com/real-time/feed2/",
    "Reuters Business": "http://feeds.reuters.com/reuters/businessNews",
    "BBC Politics": "http://feeds.bbci.co.uk/news/politics/rss.xml",
    "Vox Politics": "https://www.vox.com/rss/policy/index.xml",
    "Rolling Stone": "https://www.rollingstone.com/feed/",
    "Variety": "https://variety.com/feed/",
    "The Economist": "https://www.economist.com/the-world-this-week/rss.xml",
    "Wall Street Journal": "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
    "France 24": "https://www.france24.com/en/rss",
    "Deutsche Welle": "https://rss.dw.com/xml/rss-en-world",
    "IEEE Spectrum": "https://spectrum.ieee.org/rss/fulltext",
    "ScienceDaily": "https://www.sciencedaily.com/rss/top/science.xml",
    "Live Science": "https://www.livescience.com/feeds/all",
    "Business Insider": "https://www.businessinsider.com/rss",
    "Investopedia News": "https://www.investopedia.com/feedbuilder/feed/getfeed/?feedName=news",
    "Brookings Institution": "https://www.brookings.edu/feed/",
    "Environmental News Network": "https://www.enn.com/rss",
    "Earth Island Journal": "https://earthisland.org/journal/feed",
    "Hollywood Reporter": "https://www.hollywoodreporter.com/t/feed/",
    "Pitchfork": "https://pitchfork.com/rss/news/",
    "Le Monde (French)": "https://www.lemonde.fr/rss/une.xml",
    "Der Spiegel (German)": "https://www.spiegel.de/international/index.rss",
    "NHK World (Japanese)": "https://www3.nhk.or.jp/nhkworld/en/news/rss.xml",
    "RT News (Russian)": "https://www.rt.com/rss/news/",

    # Additional feeds
    "Chicago Tribune": "https://www.chicagotribune.com/arcio/rss/category/news/",
    "Los Angeles Times": "https://www.latimes.com/local/rss2.0.xml",
    "The Atlantic": "https://www.theatlantic.com/feed/all/",
    "BuzzFeed News": "https://www.buzzfeednews.com/feed",
    "KQED": "https://www.kqed.org/rss/news",
    "South China Morning Post": "https://www.scmp.com/rss/91/feed",
    "Japan Times": "https://www.japantimes.co.jp/feed/",
    "The Hindu": "https://www.thehindu.com/news/national/feeder/default.rss",
    "Seeking Alpha": "https://seekingalpha.com/feed.xml",
    "TheStreet": "https://www.thestreet.com/rss/frontpage",
    "Zero Hedge": "https://www.zerohedge.com/fullrss",
    "SCOTUSblog": "https://www.scotusblog.com/feed/",
    "Law360": "https://www.law360.com/rss/news",
    "Smithsonian Magazine": "https://www.smithsonianmag.com/feed/rss",
    "Artforum": "https://www.artforum.com/feed",
    "Medscape": "https://www.medscape.com/rss",
    "CDC Newsroom": "https://tools.cdc.gov/api/v2/resources/media/405952.rss",
}
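
# Feeds like these move or go offline over time. An optional liveness pass
# before the main loop can flag dead ones; this sketch uses feedparser's
# documented `bozo` flag, which is set when a feed fails to parse cleanly:
#
#   for name, feed_url in rss_feeds.items():
#       parsed = feedparser.parse(feed_url)
#       if parsed.bozo:
#           print(f"[WARN] {name}: {parsed.get('bozo_exception', 'parse error')}")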

# SQLite DB setup
print("📦 Initializing database: news_articles.db")
conn = sqlite3.connect("news_articles.db")
cur = conn.cursor()
cur.execute('''
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT,
    source TEXT,
    url TEXT UNIQUE,
    published TEXT,
    scraped_at TEXT,
    content TEXT
)
''')
conn.commit()

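# Note: because url is declared UNIQUE, the SELECT-then-INSERT pattern in the
# main loop below could be collapsed into a single statement. An equivalent
# sketch (cursor.rowcount is 0 when SQLite ignored the row as a duplicate):
#
#   cur.execute("INSERT OR IGNORE INTO articles "
#               "(title, source, url, published, scraped_at, content) "
#               "VALUES (?, ?, ?, ?, ?, ?)", row)
#   if cur.rowcount == 0:
#       print(f"🔁 Skipping duplicate: {url}")
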
def extract_article(url):
    """Download and parse one article; returns (title, text) on success,
    or (None, "[ERROR] ...") on failure."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.title, article.text
    except Exception as e:
        return None, f"[ERROR] {e}"

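# Some sites reject newspaper3k's default user agent or respond slowly. If
# that becomes a problem, Article accepts a Config object; a minimal sketch
# using newspaper's documented Config attributes (the UA string here is just
# an illustrative example, not a requirement):
#
#   from newspaper import Config
#   cfg = Config()
#   cfg.browser_user_agent = "Mozilla/5.0 (compatible; news-scraper/1.0)"
#   cfg.request_timeout = 10  # seconds
#   article = Article(url, config=cfg)
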
# Main scraping loop
for source, feed_url in rss_feeds.items():
    print(f"\n📡 FETCHING FROM: {source}")
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    feed = feedparser.parse(feed_url)

    for entry in feed.entries[:5]:  # limit to 5 articles per feed for politeness
        url = entry.get("link")
        if not url:  # some feeds emit entries without a link; skip them
            continue
        published = entry.get("published", "")
        scraped_at = datetime.now(timezone.utc).isoformat()

        cur.execute("SELECT 1 FROM articles WHERE url = ?", (url,))
        if cur.fetchone():
            print(f"🔁 Skipping duplicate: {url}")
            continue

        title, content = extract_article(url)
        if content and not content.startswith("[ERROR]"):
            cur.execute('''
                INSERT INTO articles (title, source, url, published, scraped_at, content)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (title, source, url, published, scraped_at, content))
            conn.commit()
            print(f"✅ Saved: {title[:60]}...")
            print(f"🔗 {url}\n")
        else:
            print(f"⚠️ Failed to extract: {url}")

        sleep(1)  # polite pause to avoid hammering servers

conn.close()
print("\n🎉 All done! Articles saved to 'news_articles.db'")

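# A quick way to verify the results afterwards (sketch; run in a separate
# session or after the script finishes, since the connection above is closed):
#
#   conn = sqlite3.connect("news_articles.db")
#   for source, count in conn.execute(
#           "SELECT source, COUNT(*) FROM articles GROUP BY source ORDER BY 2 DESC"):
#       print(f"{source}: {count}")
#   conn.close()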