Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import ngram
- from collections import Counter
- from bs4 import BeautifulSoup
- import urllib
- import unicodedata
def find_all(a_str, sub):
    """Yield the start index of each non-overlapping match of sub in a_str.

    Matches are reported left to right; after a match at index k the search
    resumes at k + len(sub), so overlapping occurrences are skipped.

    An empty ``sub`` matches at every position 0..len(a_str) (the search
    then advances one position at a time), so the generator always
    terminates.

    Args:
        a_str: the string (or byte string) to search in.
        sub: the substring to look for.

    Yields:
        Integer offsets into a_str, in increasing order.
    """
    # An empty needle has length 0; stepping by at least 1 guarantees the
    # search cursor moves forward. (Use a step of 1 unconditionally to
    # find overlapping matches instead.)
    step = len(sub) or 1
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)
# Scrape the pages apaimanee1.htm .. apaimanee23.htm, split each page into
# segments that start at the marker 'ตอนที่', strip the HTML from every
# segment and print the remaining text.
#
# NOTE(review): the original paste had lost all indentation; the nesting
# below is the most plausible reconstruction. In particular `break` is placed
# at the page-loop level, which stops after the first page (presumably a
# debugging aid) -- confirm before relying on it.
# NOTE(review): `s` is searched with the literals from this UTF-8 source
# file; this assumes the downloaded page uses the same encoding -- TODO confirm.
index = ngram.NGram(N=5)
opener = urllib.FancyURLopener({})
for j in range(1, 24):
    url = "http://heritage.mod.go.th/nation/apaimanee/apaimanee" + str(j) + ".htm"
    f = opener.open(url)
    try:
        s = f.read()
    finally:
        # Release the connection even if the read fails.
        f.close()

    occurs = list(find_all(s, 'ตอนที่'))
    # The first page is delimited by a different trailing marker than the
    # following pages.
    if j == 1:
        end = list(find_all(s, 'หน้าต่อไป'))
    else:
        end = list(find_all(s, 'ย้อนกลับ'))
    if not occurs or not end:
        # Fail with a message that names the page instead of a bare
        # IndexError from occurs[0] / end[-1].
        raise ValueError("expected markers not found in " + url)

    # Keep the text from the first segment marker up to 200 characters before
    # the last trailing marker (presumably to drop the navigation markup that
    # precedes it -- the 200 is an unexplained constant in the original).
    s = s[occurs[0]:end[-1] - 200]

    # Offsets are recomputed relative to the trimmed text; the last segment
    # runs to the end of that text. (The original appended end[-1], an offset
    # into the untrimmed page, and relied on slice clamping.)
    occurs = list(find_all(s, 'ตอนที่'))
    occurs.append(len(s))

    docs = []
    for seg_start, seg_end in zip(occurs, occurs[1:]):
        soup = BeautifulSoup(s[seg_start:seg_end])
        all_text = ''.join(soup.findAll(text=True))
        # Whitespace normalisation ("\xa0", "\r\n" and runs of "\n" replaced
        # by a single space) was tried here in the original and left disabled.
        print(all_text)
    break
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement