Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Conversation opened. 1 unread message.
- Skip to content
- Using Gmail with screen readers
- Search
- Gmail
- COMPOSE
- Labels
- Inbox
- Starred
- Important
- Sent Mail
- Drafts
- [Imap]/Trash
- Deleted Messages
- Notes
- Personal
- Travel
- More
- Hangouts
- More
- 1 of 3
- Print all In new window
- nyt regex~
- Inbox
- x
- Torri Raines
- Attachments3:19 PM (2 minutes ago)
- to me
- The problem: the re.sub on line 38 works just fine (output is the "cleaned up" file), but leaves the
- " Please verify you're not a robot by clicking the box.
- Invalid email address. Please re-enter.
- You must select a newsletter to subscribe to.
- Sign Up
- You agree to receive occasional updates and special offers for The New York Times's products and services.
- Thank you for subscribing.
- An error has occurred. Please try again later.
- You are already subscribed to this email.
- View all New York Times newsletters.
- See Sample
- Manage Email Preferences
- Not you?
- Privacy Policy
- Opt out or contact us anytime"
- junk in the middle, which I want to get rid of. So I'm trying to do a more focused replace before it, on line 37, but it just isn't doing anything. Searching with Ctrl+F in Notepad++ suggests my regex is fine, so I don't know why it won't replace. I've tried it with and without DOTALL, with and without having the regex string in its own variable, and with a couple of different beginnings for the regex string. I also want to get rid of the stuff higher up in the article that starts with "<h2 class="visually-hidden" id="newsletter-promo-heading">Newsletter Sign Up</h2>", but when I tried that as the beginning of the regex string it didn't work either. The messy file shows the text before any substitutions are applied.
- 3 Attachments
- Click here to Reply or Forward
- 13.47 GB (89%) of 15 GB used
- Manage
- Terms - Privacy
- Last account activity: 1 hour ago
- Details
- Torri Raines's profile photo
- Torri Raines
- Research Assistant
- Show details
- import urllib
- import re
- from collections import defaultdict
def process_article(article_string, full_article):
    """Clean one article body and write it, prefixed by its author, to a file.

    Arguments:
        article_string -- the complete HTML of the article page; the tag,
            author and title <meta> elements are read from it.
        full_article -- the article body text extracted by the caller; it may
            still contain HTML markup, which is stripped here.

    Reads the module-level names ``year``, ``month``, ``page`` and
    ``outfolder`` to build the output path, adds one to
    ``yearsOfTags[year][tag]`` for every tag found, and adds one to
    ``no_tags_counter`` when the page carries no tags.

    Returns None. The output file is named
    ``<outfolder>/<year>_<month>_<page>_<title>.txt``.
    """
    global no_tags_counter

    badchars = [';', ':', '!', '?', '\\', '/', '*', '"', '<', '>', '|']
    htmljunk = ['<p>', '</p>', '<b>', '</b>']

    # --- tags ---------------------------------------------------------------
    # The pattern used to end with a stray '"' after '/>', which no meta
    # element is followed by, so tags were never counted.
    tags = re.findall(
        '<meta property="article:tag" content="(.*?)" />', article_string)
    if tags:
        for tag in tags:
            yearsOfTags[year][tag] += 1
    else:
        print("no tags")
        no_tags_counter += 1
        print(article_string)

    # --- author -------------------------------------------------------------
    author_match = re.search(
        '<meta name="author" content="(.*?)" />', article_string, re.DOTALL)
    if author_match is None:
        author_match = re.search(
            '<meta name="byl" content="(.*?)" />', article_string)
    if author_match is None:
        author = "None_found"
        print(article_string)
    else:
        author = str(author_match.group(1))
    author = author.strip(' ')
    author = author.replace('By ', '')
    author = "~" + author.replace(' ', '_')

    # --- body ---------------------------------------------------------------
    for marker in htmljunk:
        full_article = full_article.replace(marker, '')
    junkstring = '<div class="control input-control">.*?<div id="#continues-post-newsletter"></div>'
    # The fourth positional parameter of re.sub is ``count``, not ``flags``.
    # Passing re.DOTALL positionally therefore never enabled DOTALL, and the
    # multi-line newsletter block was never matched. It must go in by keyword.
    full_article = re.sub(junkstring, '', full_article, flags=re.DOTALL)
    full_article = re.sub('<.*?>', '', full_article)

    # --- title / output file name --------------------------------------------
    title_match = re.search(
        '<meta property="og:title" content="(.*?)" />', article_string)
    if title_match is None:
        title_match = re.search('<title>(.*?)</title>', article_string)
    if title_match is None:
        title = "article"
    else:
        title = str(title_match.group(1))
    # Drop the characters listed in badchars before the title becomes part of
    # a file name.
    for char in badchars:
        title = title.replace(char, '')
    if len(title) > 150:
        # Over-long titles are cut down to their first five words.
        title = ' '.join(title.split()[:5])
    title = str(year) + "_" + str(month) + "_" + str(page) + "_" + title + ".txt"
    title = outfolder + "/" + title

    # Use a context manager so the handle is closed (and flushed) per article.
    with open(title, 'w') as writeFile:
        writeFile.write(author + " " + full_article + "\n")
    print("found article")
# Crawl driver: query the NYT site-search service, download every article it
# lists and pass each article body to process_article(); then write the tag
# counts collected in yearsOfTags to the report files.
#
# NOTE: this targets Python 2 (it relies on urllib.urlopen). The pasted script
# had lost its indentation, so the nesting below is reconstructed from the
# statement order.

outfolder = "Raw Articles/Test"
no_tags_counter = 0  # reported as "No tags" at the end of the run
failed = 0           # pages in which no article body could be located
yearsOfTags = defaultdict(lambda: defaultdict(int))  # year -> tag -> count
tagsOutfile = "test tags.txt"
tagsOutfile = open(outfolder + "/" + tagsOutfile, 'w')
baseurl = 'https://query.nytimes.com/svc/add/v1/sitesearch.json?end_date=20170103&begin_date=20170101&sort=asc&page=0&fq=document_type%3A%22article%22&facet=true'

# Article links in the search response.
web_url_pattern = re.compile('"web_url":"(.*?)",')

# Paragraph patterns, tried in order. They used to end in '<\p>', which
# Python 2's re module reads as the literal text '<p>' rather than the closing
# tag, so every capture ran on past the paragraph into whatever markup
# followed it.
body_patterns = (
    re.compile('<p class="story-body-text story-content" data.*?">(.*?)</p>', re.DOTALL),
    re.compile('<p itemprop="articleBody">(.*?)</p>', re.DOTALL),
)


def _fetch(address):
    """Download *address* and return its body with every backslash removed."""
    response = urllib.urlopen(address)
    try:
        return str(response.read()).replace('\\', '')
    finally:
        response.close()


def _write_tag_counts(outfile):
    """Write every year in yearsOfTags, followed by its tag/count lines."""
    for tag_year in sorted(yearsOfTags):
        outfile.write("\n" + str(tag_year) + "\n")
        for tag in yearsOfTags[tag_year]:
            outfile.write("%s\t%s\n" % (tag, yearsOfTags[tag_year][tag]))


for year in range(2006, 2007):
    for month in range(1, 13):
        month = str(month).zfill(2)  # two-digit month for the date parameters
        for x in range(0, 1):
            # Each x selects a ten-day window: days x*10+1 through (x+1)*10.
            start_date = str(x * 10 + 1).zfill(2)
            end_date = (x + 1) * 10
            for page in range(1, 11):
                url = ('https://query.nytimes.com/svc/add/v1/sitesearch.json?end_date='
                       + str(year) + str(month) + str(end_date)
                       + '&begin_date=' + str(year) + str(month) + str(start_date)
                       + '&sort=asc&page=' + str(page)
                       + '&fq=document_type%3A%22article%22&facet=true')
                sitestring = _fetch(url)
                # One findall replaces the search-then-findall pair; an empty
                # result simply means the loop below does nothing.
                article_urls = web_url_pattern.findall(sitestring)
                for article in article_urls:
                    article_string = _fetch(article)
                    for body_pattern in body_patterns:
                        found_article = body_pattern.findall(article_string)
                        if found_article:
                            full_article = ' '.join(found_article)
                            process_article(article_string, full_article)
                            break
                    else:
                        # Neither pattern matched: dump the page for
                        # inspection and count the failure.
                        print(article_string)
                        failed += 1
    # Per-year report. A separate loop variable is used inside
    # _write_tag_counts so the crawl's `year` is not rebound.
    yearlyOutfile = str(year) + "_tags.txt"
    with open(outfolder + "/" + yearlyOutfile, 'w') as yearlyOutfile:
        _write_tag_counts(yearlyOutfile)

# Two spaces after the colon reproduce the output of the Python 2
# statement `print "Failed: ", failed`.
print("Failed:  %s" % failed)
print("No tags:  %s" % no_tags_counter)

_write_tag_counts(tagsOutfile)
tagsOutfile.close()
- NYTcrawler_no-outline.py
- Open with Google Docs
- Displaying NYTcrawler_no-outline.py.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement