Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- __author__ = 'nina'
- import re
- import os
- from os.path \
- import join
- from operator import attrgetter
- from HTMLParser import HTMLParser
class Parser(HTMLParser):
    """
    HTML document parser.

    Collects the absolute paths of local .html/.htm links and every
    word found in element text.

    Usage:
        parser = Parser()
        links, words = parser.parse(FILE_PATH)
    """

    def handle_starttag(self, tag, attrs):
        """
        Record the href value of every local HTML link.

        Called implicitly by HTMLParser for each opening tag. For an
        anchor tag the href is resolved against the directory of the
        file being parsed and appended to self.links.

        Arguments:
        - `tag`: tag name
        - `attrs`: list of (name, value) attribute pairs
        """
        if tag != 'a':
            return
        # .get() instead of [] so an <a> without an href attribute
        # does not raise KeyError
        link = dict(attrs).get('href')
        # ignore missing hrefs and external links
        if not link or link.startswith('http'):
            return
        # remove the fragment (section) part of the link
        hash_index = link.rfind('#')
        if hash_index > -1:
            link = link[:hash_index]
        # only local html/htm documents are of interest
        if link.endswith(('html', 'htm')):
            relative_path = os.path.join(self.path_root, link)
            self.links.append(os.path.abspath(relative_path))

    def handle_data(self, data):
        """
        Record the words found in element content.

        Called implicitly by HTMLParser for text content. The content
        is split into words on any non-word character and the words
        are appended to self.words.

        Argument:
        - `data`: text content of the element
        """
        # raw string avoids the deprecated '\W' escape in a plain string
        words = re.sub(r'\W', ' ', data).split()
        if words:
            self.words.extend(words)

    def parse(self, path):
        """
        Read the file at `path` and feed its content to the parser.

        Returns a (links, words) tuple. If the file cannot be read the
        IOError is printed and the (empty) lists are returned.

        Argument:
        - `path`: path to the file
        """
        self.links = []
        self.words = []
        self.score = 0
        try:
            with open(path, 'r') as document:
                self.path_root = os.path.abspath(os.path.dirname(path))
                self.feed(document.read())
            # remove duplicate links
            self.links = list(set(self.links))
        except IOError as e:
            # best effort: report the problem, return what we have
            print(e)
        # NOTE: no `return` inside `finally` -- the original form
        # silently swallowed every non-IOError exception from feed()
        return self.links, self.words
# ask the user for the search terms and keep the non-empty ones
user_input = raw_input("Enter the keywords:")
keywords = [term for term in user_input.strip().split(" ") if len(term)]
#html_dir = 'D:\\python-2.7.7-docs-html\\'
html_dir = 'D:\\html\\'
html_files = []
# Collect every html/htm document under html_dir.
# endswith() (rather than `".html" in name`) avoids picking up files
# such as "page.html.bak"; ".htm" is included so the collection matches
# the links the Parser class follows.
for path, subdirs, files in os.walk(html_dir):
    for name in files:
        if name.endswith(('.html', '.htm')):
            html_files.append(join(path, name))
def _parse_document(file_path):
    # run one document through its own Parser instance
    document_parser = Parser()
    document_parser.parse(file_path)
    return document_parser

# one Parser per collected html file, in the same order as html_files
parsed_html_files = [_parse_document(html_file) for html_file in html_files]
# Each parsed html file gets its own list of the parsers that link to
# it: parsed_html_files[i] is linked to by every parser in links[i].
links = []
for _ in parsed_html_files:
    links.append([])
# Parser.parse() already stores hrefs as ABSOLUTE paths, so compare
# them directly against the absolute path of each candidate target.
# (The original prepended the linking file's directory to an already
# absolute link, which doubled the prefix and could never match.)
for i in range(len(parsed_html_files)):
    link_to_i = os.path.abspath(html_files[i])
    for parsed_html_file_j in parsed_html_files:
        # links are deduplicated in parse(), so this appends each
        # linking parser at most once
        if link_to_i in parsed_html_file_j.links:
            links[i].append(parsed_html_file_j)
print("parsing finished")
# Each parsed file has a score attribute which stores the information
# required to sort the results later.

def _all_keywords_present(words, terms):
    # A page only counts if EVERY keyword appears (as a substring) in
    # at least one of its words. The original flag-based check was
    # satisfied by any single keyword hit.
    return all(any(term in word for word in words) for term in terms)

def _match_count(words, terms):
    # Number of (word, keyword) substring hits on the page.
    # (The original inner loop tested the stale variable `keyword`
    # instead of `keyword_`, counting only the last keyword.)
    return sum(1 for word in words for term in terms if term in word)

for i, parsed_html_file in enumerate(parsed_html_files):
    words = parsed_html_file.words
    # all keywords need to be present in order to count them
    if not _all_keywords_present(words, keywords):
        continue
    parsed_html_file.score += _match_count(words, keywords)
    # pages linking to this one also contribute to its score, but only
    # if they themselves contain all of the keywords (the original
    # never reset its not-found flag here, so this filter was dead)
    for linking_file in links[i]:
        linking_words = linking_file.words
        if _all_keywords_present(linking_words, keywords):
            parsed_html_file.score += _match_count(linking_words, keywords)
print("search finished, sorting results")
# Sort (file, parser) PAIRS together: indexing html_files[i] after
# sorting only parsed_html_files (as the original did) paired every
# score with the wrong file name.
results = sorted(zip(html_files, parsed_html_files),
                 key=lambda pair: pair[1].score, reverse=True)
for html_path, parsed_html_file in results:
    print("| " + html_path + " score: " + str(parsed_html_file.score))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement