Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- __author__ = 'nina'
- import re
- import os
- from os.path \
- import join
- from operator import attrgetter
- from HTMLParser import HTMLParser
class Parser(HTMLParser):
    """
    HTML document parser.

    Collects the absolute paths of local .html/.htm links and every
    word found in element text.

    Usage:
        parser = Parser()
        links, words = parser.parse(FILE_PATH)
    """

    def handle_starttag(self, tag, attrs):
        """
        Record the href value of every local HTML link.

        Called implicitly by HTMLParser for each opening tag. For an
        anchor tag the href is resolved against the directory of the
        file being parsed and appended to self.links.

        Arguments:
        - `tag`: tag name
        - `attrs`: list of (name, value) attribute pairs
        """
        if tag != 'a':
            return
        # .get() instead of [] so an <a> without an href attribute
        # does not raise KeyError
        link = dict(attrs).get('href')
        # ignore missing hrefs and external links
        if not link or link.startswith('http'):
            return
        # remove the fragment (section) part of the link
        hash_index = link.rfind('#')
        if hash_index > -1:
            link = link[:hash_index]
        # only local html/htm documents are of interest
        if link.endswith(('html', 'htm')):
            relative_path = os.path.join(self.path_root, link)
            self.links.append(os.path.abspath(relative_path))

    def handle_data(self, data):
        """
        Record the words found in element content.

        Called implicitly by HTMLParser for text content. The content
        is split into words on any non-word character and the words
        are appended to self.words.

        Argument:
        - `data`: text content of the element
        """
        # raw string avoids the deprecated '\W' escape in a plain string
        words = re.sub(r'\W', ' ', data).split()
        if words:
            self.words.extend(words)

    def parse(self, path):
        """
        Read the file at `path` and feed its content to the parser.

        Returns a (links, words) tuple. If the file cannot be read the
        IOError is printed and the (empty) lists are returned.

        Argument:
        - `path`: path to the file
        """
        self.links = []
        self.words = []
        self.score = 0
        try:
            with open(path, 'r') as document:
                self.path_root = os.path.abspath(os.path.dirname(path))
                self.feed(document.read())
            # remove duplicate links
            self.links = list(set(self.links))
        except IOError as e:
            # best effort: report the problem, return what we have
            print(e)
        # NOTE: no `return` inside `finally` -- the original form
        # silently swallowed every non-IOError exception from feed()
        return self.links, self.words
# ask the user for the search terms and keep the non-empty ones
user_input = raw_input("Enter the keywords:")
keywords = [term for term in user_input.strip().split(" ") if len(term)]
#html_dir = 'D:\\python-2.7.7-docs-html\\'
html_dir = 'D:\\html\\'
html_files = []
# Collect every html/htm document under html_dir.
# endswith() (rather than `".html" in name`) avoids picking up files
# such as "page.html.bak"; ".htm" is included so the collection matches
# the links the Parser class follows.
for path, subdirs, files in os.walk(html_dir):
    for name in files:
        if name.endswith(('.html', '.htm')):
            html_files.append(join(path, name))
def _parse_document(file_path):
    # run one document through its own Parser instance
    document_parser = Parser()
    document_parser.parse(file_path)
    return document_parser

# one Parser per collected html file, in the same order as html_files
parsed_html_files = [_parse_document(html_file) for html_file in html_files]
# Each parsed html file gets its own list of the parsers that link to
# it: parsed_html_files[i] is linked to by every parser in links[i].
links = []
for _ in parsed_html_files:
    links.append([])
# Parser.parse() already stores hrefs as ABSOLUTE paths, so compare
# them directly against the absolute path of each candidate target.
# (The original prepended the linking file's directory to an already
# absolute link, which doubled the prefix and could never match.)
for i in range(len(parsed_html_files)):
    link_to_i = os.path.abspath(html_files[i])
    for parsed_html_file_j in parsed_html_files:
        # links are deduplicated in parse(), so this appends each
        # linking parser at most once
        if link_to_i in parsed_html_file_j.links:
            links[i].append(parsed_html_file_j)
print("parsing finished")
# Each parsed file has a score attribute which stores the information
# required to sort the results later.

def _all_keywords_present(words, terms):
    # A page only counts if EVERY keyword appears (as a substring) in
    # at least one of its words. The original flag-based check was
    # satisfied by any single keyword hit.
    return all(any(term in word for word in words) for term in terms)

def _match_count(words, terms):
    # Number of (word, keyword) substring hits on the page.
    # (The original inner loop tested the stale variable `keyword`
    # instead of `keyword_`, counting only the last keyword.)
    return sum(1 for word in words for term in terms if term in word)

for i, parsed_html_file in enumerate(parsed_html_files):
    words = parsed_html_file.words
    # all keywords need to be present in order to count them
    if not _all_keywords_present(words, keywords):
        continue
    parsed_html_file.score += _match_count(words, keywords)
    # pages linking to this one also contribute to its score, but only
    # if they themselves contain all of the keywords (the original
    # never reset its not-found flag here, so this filter was dead)
    for linking_file in links[i]:
        linking_words = linking_file.words
        if _all_keywords_present(linking_words, keywords):
            parsed_html_file.score += _match_count(linking_words, keywords)
print("search finished, sorting results")
# Sort (file, parser) PAIRS together: indexing html_files[i] after
# sorting only parsed_html_files (as the original did) paired every
# score with the wrong file name.
results = sorted(zip(html_files, parsed_html_files),
                 key=lambda pair: pair[1].score, reverse=True)
for html_path, parsed_html_file in results:
    print("| " + html_path + " score: " + str(parsed_html_file.score))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement