Not a member of Pastebin yet? Sign up — it unlocks many cool features!
import sqlite3
from datetime import datetime, timezone
from time import sleep

import feedparser
from newspaper import Article
# Master list of RSS feeds to scrape, keyed by a human-readable source name.
# NOTE: insertion order matters — the main loop visits sources in this order.
rss_feeds = {
    # Original & earlier feeds
    "Reuters": "http://feeds.reuters.com/reuters/topNews",
    "NPR": "https://www.npr.org/rss/rss.php?id=1001",
    "CNN": "http://rss.cnn.com/rss/cnn_topstories.rss",
    "NYTimes": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "The Guardian": "https://www.theguardian.com/world/rss",
    "Ars Technica": "https://feeds.arstechnica.com/arstechnica/index",
    "Wired": "https://www.wired.com/feed/rss",
    "TechCrunch": "https://techcrunch.com/feed/",
    "Al Jazeera": "https://www.aljazeera.com/xml/rss/all.xml",
    "BBC": "http://feeds.bbci.co.uk/news/rss.xml",
    "AP Top Headlines": "https://apnews.com/apf-topnews.rss",
    "Bloomberg Site": "https://www.bloomberg.com/feeds/site.xml",
    "NPR Politics": "https://www.npr.org/rss/rss.php?id=1014",
    "Washington Post National": "http://feeds.washingtonpost.com/rss/national",
    "MIT Technology Review": "https://www.technologyreview.com/feed/",
    "Scientific American": "https://www.scientificamerican.com/feed/rss/",
    "Nature News": "https://www.nature.com/nature/articles?type=news&format=rss",
    "NASA Breaking News": "https://www.nasa.gov/rss/dyn/breaking_news.rss",
    "Financial Times": "https://www.ft.com/?format=rss",
    "CNBC Top News": "https://www.cnbc.com/id/100003114/device/rss/rss.html",
    "MarketWatch Top Stories": "https://www.marketwatch.com/rss/topstories",
    "Politico Latest": "https://www.politico.com/rss/politics08.xml",
    "The Hill": "https://thehill.com/rss/syndicator/19110",
    "Inside Climate News": "https://insideclimatenews.org/feed/",
    "Climate Central": "https://www.climatecentral.org/news/feed",
    "ProPublica": "https://www.propublica.org/feeds/",
    "Reuters Science": "http://feeds.reuters.com/reuters/scienceNews",
    "USA Today": "https://rssfeeds.usatoday.com/usatoday-NewsTopStories",
    "CBS News": "https://www.cbsnews.com/latest/rss/main",
    "NBC News": "https://feeds.nbcnews.com/nbcnews/public/news",
    "ABC News": "https://abcnews.go.com/abcnews/topstories",
    "Engadget": "https://www.engadget.com/rss.xml",
    "Gizmodo": "https://gizmodo.com/rss",
    "Space.com": "https://www.space.com/feeds/all",
    "New Scientist": "https://www.newscientist.com/feed/home/",
    "Forbes": "https://www.forbes.com/real-time/feed2/",
    "Reuters Business": "http://feeds.reuters.com/reuters/businessNews",
    "BBC Politics": "http://feeds.bbci.co.uk/news/politics/rss.xml",
    "Vox Politics": "https://www.vox.com/rss/policy/index.xml",
    "Rolling Stone": "https://www.rollingstone.com/feed/",
    "Variety": "https://variety.com/feed/",
    "The Economist": "https://www.economist.com/the-world-this-week/rss.xml",
    "Wall Street Journal": "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
    "France 24": "https://www.france24.com/en/rss",
    "Deutsche Welle": "https://rss.dw.com/xml/rss-en-world",
    "IEEE Spectrum": "https://spectrum.ieee.org/rss/fulltext",
    "ScienceDaily": "https://www.sciencedaily.com/rss/top/science.xml",
    "Live Science": "https://www.livescience.com/feeds/all",
    "Business Insider": "https://www.businessinsider.com/rss",
    "Investopedia News": "https://www.investopedia.com/feedbuilder/feed/getfeed/?feedName=news",
    "Brookings Institution": "https://www.brookings.edu/feed/",
    "Environmental News Network": "https://www.enn.com/rss",
    "Earth Island Journal": "https://earthisland.org/journal/feed",
    "Hollywood Reporter": "https://www.hollywoodreporter.com/t/feed/",
    "Pitchfork": "https://pitchfork.com/rss/news/",
    "Le Monde (French)": "https://www.lemonde.fr/rss/une.xml",
    "Der Spiegel (German)": "https://www.spiegel.de/international/index.rss",
    "NHK World (Japanese)": "https://www3.nhk.or.jp/nhkworld/en/news/rss.xml",
    "RT News (Russian)": "https://www.rt.com/rss/news/",
    # Newly added extra feeds
    "Chicago Tribune": "https://www.chicagotribune.com/arcio/rss/category/news/",
    "Los Angeles Times": "https://www.latimes.com/local/rss2.0.xml",
    "The Atlantic": "https://www.theatlantic.com/feed/all/",
    "BuzzFeed News": "https://www.buzzfeednews.com/feed",
    "KQED": "https://www.kqed.org/rss/news",
    "South China Morning Post": "https://www.scmp.com/rss/91/feed",
    "Japan Times": "https://www.japantimes.co.jp/feed/",
    "The Hindu": "https://www.thehindu.com/news/national/feeder/default.rss",
    "Seeking Alpha": "https://seekingalpha.com/feed.xml",
    "TheStreet": "https://www.thestreet.com/rss/frontpage",
    "Zero Hedge": "https://www.zerohedge.com/fullrss",
    "SCOTUSblog": "https://www.scotusblog.com/feed/",
    "Law360": "https://www.law360.com/rss/news",
    "Smithsonian Magazine": "https://www.smithsonianmag.com/feed/rss",
    "Artforum": "https://www.artforum.com/feed",
    "Medscape": "https://www.medscape.com/rss",
    "CDC Newsroom": "https://tools.cdc.gov/api/v2/resources/media/405952.rss",
}
# --- SQLite storage setup -------------------------------------------------
# One table of articles keyed by URL; the UNIQUE constraint on `url` is what
# allows the scraper to detect and skip articles it has already stored.
print("📦 Initializing database: news_articles.db")
conn = sqlite3.connect("news_articles.db")
cur = conn.cursor()
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        source TEXT,
        url TEXT UNIQUE,
        published TEXT,
        scraped_at TEXT,
        content TEXT
    )
    """
)
conn.commit()
def extract_article(url):
    """Download and parse a single article page.

    Returns a ``(title, text)`` pair on success. On any failure the title is
    ``None`` and the text is an ``"[ERROR] ..."`` string, so the caller can
    detect problems via the prefix rather than handling exceptions.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        return page.title, page.text
    except Exception as exc:  # broad on purpose: network/parse errors are routine in bulk scraping
        return None, f"[ERROR] {exc}"
# --- Main scraping loop ---------------------------------------------------
# For each source: parse its RSS feed, take the first few entries, skip URLs
# already in the database, extract the full article text, and persist it.
MAX_PER_FEED = 5  # politeness cap so no single source gets hammered

for source, feed_url in rss_feeds.items():
    print(f"\n📡 FETCHING FROM: {source}")
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    feed = feedparser.parse(feed_url)
    for entry in feed.entries[:MAX_PER_FEED]:
        # Bug fix: `entry.link` raises AttributeError on malformed entries;
        # use .get() and skip entries that have no link at all.
        url = entry.get("link")
        if not url:
            continue
        published = entry.get("published", "")
        # Bug fix: datetime.utcnow() is deprecated and returns a naive
        # datetime; record a timezone-aware UTC timestamp instead.
        scraped_at = datetime.now(timezone.utc).isoformat()

        # Dedup check against the UNIQUE url column before doing any network work.
        cur.execute("SELECT 1 FROM articles WHERE url = ?", (url,))
        if cur.fetchone():
            print(f"🔁 Skipping duplicate: {url}")
            continue

        title, content = extract_article(url)
        if content and not content.startswith("[ERROR]"):
            cur.execute('''
            INSERT INTO articles (title, source, url, published, scraped_at, content)
            VALUES (?, ?, ?, ?, ?, ?)
            ''', (title, source, url, published, scraped_at, content))
            conn.commit()
            # Guard against a None/empty title so the slice can't crash the run.
            print(f"✅ Saved: {(title or '')[:60]}...")
            print(f"🔗 {url}\n")
        else:
            print(f"⚠️ Failed to extract: {url}")
        sleep(1)  # Polite pause to avoid hammering servers
conn.close()
print("\n🎉 All done! Articles saved to 'news_articles.db'")
Advertisement
Add Comment
Please sign in to add a comment.