Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import codecs
import json
import sys

from bs4 import BeautifulSoup
# Placeholder for the user-specific part of the path; edit before running.
blah = 'put the rest of the path here'

# Forum thread the saved pages presumably came from (the page number was
# appended to this URL by the now-removed download step):
#   https://forum-en.guildwars2.com/forum/professions/thief/Nerf-Wish-list/page/

# Directory holding the saved page_<n>.html inputs; the post_<n>.txt outputs
# are written here as well.  Must end with a slash because file names are
# appended by plain string concatenation.
folder_path = 'C:/Users/' + blah + '/Desktop/nerfwishlist/'

# Accumulates every parsed request as a JSON-encoded string.
request_list = []
# Request fields paired with the label that introduces each one in a post.
# Order matters: when a line contains several labels, the first match in
# this order decides which field the line starts.
_FIELD_LABELS = (
    ('What', 'What:'),
    ('Why', 'Why:'),
    ('Suggestion', 'Suggestion:'),
    ('Thread', 'Thread:'),
    ('Originally by', 'Originally by:'),
)

# While one of these fields is being collected, a line that opens a new
# 'What:' or 'Originally by:' section ends the current request.
_FLUSH_FIELDS = ('Thread', 'Why', 'Suggestion')


def _new_request(post_date, permalink):
    """Return an empty request record stamped with the post's date and link."""
    return {'Originally by': '', 'What': '', 'Why': '', 'Suggestion': '',
            'Thread': '', 'Post Date': post_date,
            'Resurrected from': permalink}


def _parse_requests(lines, author, post_date, permalink):
    """Split the text lines of one post into request records.

    A post may contain several requests.  Each labelled line ('What:',
    'Why:', ...) switches the field being collected; unlabelled lines are
    appended to the field currently being collected.

    Args:
        lines: iterable of text lines taken from the post body.
        author: name stored as 'Originally by' when a 'What:' label is
            reached and no originator has been recorded yet.
        post_date: value stored under 'Post Date' in every record.
        permalink: value stored under 'Resurrected from' in every record.

    Returns:
        A list of request dicts in the order they appear in the post.  The
        list always contains at least one record, even if it is empty.
    """
    requests = []
    request = _new_request(post_date, permalink)
    current = None  # name of the field being collected, if any

    for line in lines:
        # A new request starts inside the same post: store the finished one.
        if current in _FLUSH_FIELDS and (
                'Originally by:' in line or 'What:' in line):
            requests.append(request)
            request = _new_request(post_date, permalink)
            current = None

        # Continuation text: keep the whole line unless it carries the label
        # of a *different* field.  A line repeating the label of the field
        # being collected is kept here AND handled below, so its text is
        # stored twice; that matches the original script's behaviour.
        if current is not None and not any(
                label in line
                for field, label in _FIELD_LABELS if field != current):
            # 'What' is collected without line breaks; other fields keep them.
            request[current] += line if current == 'What' else line + '\n'

        # Label handling: switch to the first field whose label occurs.
        for field, label in _FIELD_LABELS:
            if label not in line:
                continue
            if field == 'What' and not request['Originally by']:
                # No explicit originator seen: credit the post's author.
                request['Originally by'] = author
            current = field
            # Assumes the label starts the line: the slice drops len(label)
            # leading characters wherever the label actually occurs.
            rest = line[len(label):].strip()
            if rest:
                request[field] += rest if field == 'What' else rest + '\n'
            break

    requests.append(request)
    return requests


# Parse the locally saved forum pages page_1.html .. page_14.html and collect
# every post whose body contains a 'What:' label.
for x in range(1, 15):
    # The context manager closes the page file once it has been read.
    with open(folder_path + "page_%d.html" % x, 'r') as target:
        page = target.read()
    soup = BeautifulSoup(page, "html.parser")

    for post in soup.findAll('div', {'class': 'post'}):
        header = post.find('div', {'class': 'post-header'})
        member_full = header.find('a').text
        message_content = post.find('div', {'class': 'message-content'})
        post_date = post.find('time').text
        permalink = ('https://forum-en.guildwars2.com'
                     + post.find('a', {'class': 'permalink icon'})['href'])

        # Only posts that contain a 'What:' label are treated as requests.
        if 'What:' not in str(message_content):
            continue

        lines = (line
                 for chunk in message_content.findAll('p')
                 for line in chunk.stripped_strings)
        for request in _parse_requests(lines, member_full, post_date,
                                       permalink):
            # Stored as JSON text; the output stage decodes each entry.
            request_list.append(json.dumps(request))
# A batch is flushed to disk once adding the next request would bring it to
# this many characters or more (presumably the forum's per-post size limit;
# confirm).
_MAX_POST_CHARS = 4500


def _format_request(fields):
    """Render one decoded request dict as text with asterisk-wrapped labels.

    'Suggestion' and 'Thread' are emitted only when non-empty.  The result
    ends with a blank line so consecutive requests are visually separated.
    """
    parts = [
        '*Originally by:* ' + fields['Originally by'].strip() + '\n',
        '*Post date:* ' + fields['Post Date'].strip() + '\n',
        '*Resurrected from:* ' + fields['Resurrected from'].strip() + '\n',
        '*What:* ' + fields['What'].strip() + '\n',
        '*Why:*\n' + fields['Why'].strip() + '\n',
    ]
    if fields['Suggestion']:
        parts.append('*Suggestion:*\n' + fields['Suggestion'].strip() + '\n')
    if fields['Thread']:
        parts.append('*Thread:* ' + fields['Thread'].strip() + '\n')
    parts.append('\n')
    return ''.join(parts)


def _write_post(index, text):
    """Write one batch of formatted requests to post_<index>.txt as UTF-8."""
    file_path = folder_path + "post_%d.txt" % index
    # Mode 'w' already truncates; the context manager guarantees the data is
    # flushed and the handle closed.
    with codecs.open(file_path, 'w', 'utf-8') as target:
        target.write(text)


# Group the formatted requests into batches below the size limit and write
# each batch to its own numbered file.
count = 0
currentBigPost = ''
for request in request_list:
    try:
        j = json.loads(request)
    except ValueError:
        # Show the entry that could not be decoded, then abort.
        print(request)
        sys.exit(-1)
    postString = _format_request(j)

    if len(currentBigPost) + len(postString) >= _MAX_POST_CHARS:
        _write_post(count, currentBigPost)
        currentBigPost = ''
        count = count + 1
    currentBigPost = currentBigPost + postString

# NOTE(review): this extra increment makes the final file skip one index
# (e.g. post_0, post_1, post_3).  It is kept so file names match earlier
# runs; the original indentation was lost, so confirm this was intended.
count = count + 1
_write_post(count, currentBigPost)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement