Advertisement
Guest User

lolz

a guest
Sep 4th, 2015
184
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.05 KB | None | 0 0
  1. import json
  2. import codecs
  3. from bs4 import BeautifulSoup
  4.  
  5. blah = 'put the rest of the path here'
  6.  
  7. #url = 'https://forum-en.guildwars2.com/forum/professions/thief/Nerf-Wish-list/page/'
  8.  
  9. folder_path = 'C:/Users/'+blah+'/Desktop/nerfwishlist/'
  10.  
  11. request_list = []
  12.  
  13. for x in range (1, 15):
  14.    # page = urllib2.urlopen(url+"%d" % x).read()
  15.    # print page
  16.     target = open(folder_path+"page_%d.html" % x, 'r')
  17.    # target.truncate()
  18.    # target.write(page)
  19.     page = target.read()
  20.  
  21.     soup = BeautifulSoup(page, "html.parser")
  22.     soup.prettify()
  23.     for post in soup.findAll('div', {'class' : 'post'}):
  24.         header = post.find('div',{'class' : 'post-header'})
  25.  
  26.         member_html = header.find('a')
  27.         member_full = member_html.text
  28.         #member_number = member_full.find('span').text
  29.  
  30.         message_content = post.find('div',{'class' : 'message-content'})
  31.         post_date = post.find('time').text
  32.         permalink = post.find('a',{'class' : 'permalink icon'})['href']
  33.         permalink = 'https://forum-en.guildwars2.com'+permalink
  34.  
  35.         if 'What:' in str(message_content):
  36.             #print '====================================================================='
  37.             recordWhat = False
  38.             recordWhy = False
  39.             recordThread = False
  40.             recordSuggestion = False
  41.             recordOrg = False
  42.  
  43.             request = {'Originally by': '', 'What': '', 'Why': '', 'Suggestion': '', 'Thread': '','Post Date': post_date,'Resurrected from':permalink}
  44.  
  45.             for chunk in message_content.findAll('p'):
  46.                 for line in chunk.stripped_strings:
  47.                     #print "line: "+line
  48.  
  49.                     if (recordThread or recordWhy or recordSuggestion) and ('Originally by:' in line or 'What:' in line):
  50.                         jsonarray = json.dumps(request)
  51.                         request_list.append(jsonarray)
  52.                         recordWhat = False
  53.                         recordWhy = False
  54.                         recordThread = False
  55.                         recordSuggestion = False
  56.                         recordOrg = False
  57.                         request = {'Originally by': '', 'What': '', 'Why': '', 'Suggestion': '', 'Thread': '','Post Date': post_date,'Resurrected from':permalink}
  58.                         #print 'Dump jason since it\'s multiparted'
  59.  
  60.                     if recordWhat and 'Why:' not in line and 'Suggestion:' not in line and 'Thread:' not in line and 'Originally by:' not in line:
  61.                         #print '+++recording request['What']+++'
  62.                         request['What'] = request['What'] + line
  63.                     elif recordWhy and 'What:' not in line and 'Suggestion:' not in line and 'Thread:' not in line and 'Originally by:' not in line:
  64.                         #print '+++recording request['Why']+++'
  65.                         request['Why'] = request['Why'] + line+'\n'
  66.                     elif recordSuggestion and 'What:' not in line and 'Why:' not in line and 'Thread:' not in line and 'Originally by:' not in line:
  67.                         #print '+++recording request['Suggestion']+++'
  68.                         request['Suggestion'] = request['Suggestion'] + line+'\n'
  69.                     elif recordThread and 'What:' not in line and 'Why:' not in line and 'Suggestion:' not in line and 'Originally by:' not in line:
  70.                         #print '+++recording request['Thread']+++'
  71.                         request['Thread'] = request['Thread'] + line+'\n'
  72.                     elif recordOrg and 'What:' not in line and 'Why:' not in line and 'Suggestion:' not in line and 'Thread:' not in line:
  73.                         #print '+++recording request['Originally by']+++'
  74.                         request['Originally by'] = request['Originally by'] + line+'\n'
  75.  
  76.                     if 'What:' in line:
  77.                         if len(request['Originally by']) == 0:
  78.                             request['Originally by'] = member_full
  79.                         recordWhat = True
  80.                         recordWhy = False
  81.                         recordThread = False
  82.                         recordSuggestion = False
  83.                         recordOrg = False
  84.                         if len(line[len('What:'):].strip()) != 0:
  85.                             request['What'] = request['What'] + line[len('What:'):].strip()
  86.                     elif 'Why:' in line:
  87.                         recordWhy = True
  88.                         recordWhat = False
  89.                         recordThread = False
  90.                         recordSuggestion = False
  91.                         recordOrg = False
  92.                         if len(line[len('Why:'):].strip()) != 0:
  93.                             request['Why'] = request['Why'] + line[len('Why:'):].strip()+'\n'
  94.                     elif 'Suggestion:' in line:
  95.                         recordWhy = False
  96.                         recordWhat = False
  97.                         recordThread = False
  98.                         recordSuggestion = True
  99.                         recordOrg = False
  100.                         if len(line[len('Suggestion:'):].strip()) != 0:
  101.                             request['Suggestion'] = request['Suggestion'] + line[len('Suggestion:'):].strip()+'\n'
  102.                             line = ''
  103.                     elif 'Thread:' in line:
  104.                         recordWhy = False
  105.                         recordWhat = False
  106.                         recordThread = True
  107.                         recordSuggestion = False
  108.                         recordOrg = False
  109.                         if len(line[len('Thread:'):].strip()) != 0:
  110.                             request['Thread'] = request['Thread'] + line[len('Thread:'):].strip()+'\n'
  111.                     elif 'Originally by:' in line:
  112.                         recordWhy = False
  113.                         recordWhat = False
  114.                         recordThread = False
  115.                         recordSuggestion = False
  116.                         recordOrg = True
  117.                         if len(line[len('Originally by:'):].strip()) != 0:
  118.                             request['Originally by'] = request['Originally by'] + line[len('Originally by:'):].strip()+'\n'
  119.  
  120.             jsonarray = json.dumps(request)
  121.             request_list.append(jsonarray)
  122.  
  123.             '''
  124.            request['Why'] = request['Why'].strip()
  125.            print "Originally by: "+request['Originally by']
  126.            print "What: "+request['What']
  127.            print "Why: "+request['Why']
  128.            print "Suggestion: "+request['Suggestion']
  129.            print "Thread: "+request['Thread']
  130.            print '------------------------------------------------------------'
  131.            print message_content
  132.            print '------------------------------------------------------------'
  133.           '''
  134.  
  135. postString = ''
  136. count = 0
  137. currentBigPost = ''
  138. for request in request_list:
  139.     postString = ''
  140.     j = ''
  141.     try:
  142.         j = json.loads(request)
  143.     except:
  144.         print request
  145.         exit(-1)
  146.     postString = postString + '*Originally by:* '+j['Originally by'].strip()+'\n'
  147.     postString = postString + '*Post date:* '+j['Post Date'].strip()+'\n'
  148.     postString = postString + '*Resurrected from:* '+j['Resurrected from'].strip()+'\n'
  149.     postString = postString + '*What:* '+j['What'].strip()+'\n'
  150.     postString = postString + '*Why:*\n'+j['Why'].strip()+'\n'
  151.     if len(j['Suggestion']) != 0:
  152.         postString = postString + '*Suggestion:*\n'+j['Suggestion'].strip()+'\n'
  153.     if len(j['Thread']) != 0:
  154.         postString = postString + '*Thread:* '+j['Thread'].strip()+'\n'
  155.     postString = postString + '\n'
  156.  
  157.     if len(currentBigPost)+len(postString) >= 4500:
  158.         file_path = folder_path+"post_%d.txt" % count
  159.         target = codecs.open(file_path, 'w', 'utf-8')
  160.         target.truncate()
  161.         target.write(currentBigPost)
  162.         currentBigPost = ''
  163.         count = count + 1
  164.  
  165.     currentBigPost = currentBigPost + postString
  166.  
  167. count = count + 1
  168. file_path = folder_path+"post_%d.txt" % count
  169. target = codecs.open(file_path, 'w', 'utf-8')
  170. target.truncate()
  171. target.write(currentBigPost)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement