Advertisement
Guest User

Untitled

a guest
Oct 31st, 2014
153
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.41 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import ngram
  3. from collections import Counter
  4. from bs4 import BeautifulSoup
  5. import urllib
  6. import unicodedata
  7. def find_all(a_str, sub):
  8.     start = 0
  9.     while True:
  10.         start = a_str.find(sub, start)
  11.         if start == -1: return
  12.         yield start
  13.         start += len(sub) # use start += 1 to find overlapping matches
  14. index = ngram.NGram(N=5)
  15.  
  16. opener = urllib.FancyURLopener({})
  17. for j in range(1,24):
  18.     f = opener.open("http://heritage.mod.go.th/nation/apaimanee/apaimanee"+str(j)+".htm")
  19.     s=f.read()
  20.            
  21.     occurs =list(find_all(s, 'ตอนที่'))
  22.     if j==1:
  23.         end =list(find_all(s, 'หน้าต่อไป'))
  24.     else:
  25.         end =list(find_all(s, 'ย้อนกลับ'))
  26.  
  27.     s=s[occurs[0]:end[-1]-200]
  28.     occurs =list(find_all(s, 'ตอนที่'))
  29.     occurs.append(end[-1])
  30.     docs=[]
  31.     for i in range(len(occurs)-1):
  32.         soup = BeautifulSoup(s[occurs[i]:occurs[i+1]])
  33.         all_text = ''.join(soup.findAll(text=True))
  34.         #all_text = all_text.replace("\xa0"," ")
  35.         #all_text = all_text.replace("\r\n"," ")
  36.         #all_text = all_text.replace("\n"," ")
  37.         #all_text = all_text.replace("\n\n"," ")
  38.         #all_text = all_text.replace("\n\n\n"," ")
  39.         #all_text = all_text.replace("\n\n\n\n"," ")
  40.         #all_text = all_text.replace("\n\n\n\n\n"," ")
  41.         print(all_text)
  42.     break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement