Advertisement
kernel_memory_dump

Untitled

Jun 29th, 2014
269
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.68 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. __author__ = 'nina'
  3. import re
  4. import os
  5. from os.path \
  6.     import join
  7.  
  8. from operator import attrgetter
  9. from HTMLParser import HTMLParser
  10.  
  11.  
  12. class Parser(HTMLParser):
  13.     """
  14.    Parser HTML dokumenata
  15.  
  16.    Upotreba:
  17.        parser = Parser()
  18.        parser.parse(FILE_PATH)
  19.    """
  20.     def handle_starttag(self, tag, attrs):
  21.         """
  22.        Metoda beleži sadržaj href atributa
  23.  
  24.        Poziv metode vrši se implicitno prilikom nailaska na tag
  25.        unutar HTML fajla. Ukoliko je u pitanju anchor tag, beleži
  26.        se vrednost href atributa.
  27.  
  28.        Argumenti:
  29.        - `tag`: naziv taga
  30.        - `attrs`: lista atributa
  31.        """
  32.         if tag == 'a':
  33.             # typecast da izbegnem looping
  34.             attrs = dict(attrs)
  35.             link = attrs['href']
  36.  
  37.             # ignoriši spoljnje linkove i uzmi u obzir samo html fajlove
  38.             if not link.startswith('http'):
  39.                 # ukloni sekciju iz linka
  40.                 hash_index = link.rfind('#')
  41.                 if hash_index > -1:
  42.                     link = link[:hash_index]
  43.  
  44.                 if link.endswith('html') or link.endswith('htm'):
  45.                     relative_path = os.path.join(self.path_root, link)
  46.                     link_path = os.path.abspath(relative_path)
  47.                     self.links.append(link_path)
  48.  
  49.     def handle_data(self, data):
  50.         """
  51.        Metoda beleži pronađene reči
  52.  
  53.        Poziv metode vrši se implicitno prilikom nailaska na sadržaj
  54.        HTML elemenata. Sadržaj elementa se deli u reči koje se beleže
  55.        u odgovarajuću listu.
  56.  
  57.        Argument:
  58.        - `data`: dobijeni sadržaj elementa
  59.        """
  60.         stripped_text = re.sub('[\W]', ' ', data).split()
  61.         if stripped_text:
  62.             self.words.extend(stripped_text)
  63.  
  64.     def parse(self, path):
  65.         """
  66.        Metoda učitava sadržaj fajla i prosleđuje ga parseru
  67.  
  68.        Argument:
  69.        - `path`: putanja do fajla
  70.        """
  71.         self.links = []
  72.         self.words = []
  73.         self.score = 0
  74.  
  75.         try:
  76.             with open(path, 'r') as document:
  77.                 self.path_root = os.path.abspath(os.path.dirname(path))
  78.                 content = document.read()
  79.                 self.feed(content)
  80.  
  81.                 # očisti duplikate
  82.                 self.links = list(set(self.links))
  83.  
  84.         except IOError as e:
  85.             print e
  86.         finally:
  87.             return self.links, self.words
  88.  
  89. keywords = raw_input("Enter the keywords:")
  90. keywords = keywords.strip()
  91. keywords = keywords.split(" ")
  92. keywords = filter(len, keywords)
  93.  
  94. #html_dir = 'D:\\python-2.7.7-docs-html\\'
  95. html_dir = 'D:\\html\\'
  96. html_files = []
  97. for path, subdirs, files in os.walk(html_dir):
  98.     for name in files:
  99.         if ".html" in name:
  100.             html_files.append(join(path, name))
  101.  
  102. parsed_html_files = []
  103. for html_file in html_files:
  104.     parsed_html_file = Parser()
  105.     parsed_html_file.parse(html_file)
  106.     parsed_html_files.append(parsed_html_file)
  107.  
  108. links = []
  109. # each parsed html file needs to get it's
  110. # own list of links to it
  111. for i in range(len(parsed_html_files)):
  112.     links.append([])
  113.  
  114. # find all links to each file and putting them in a list of lists (links)
  115. # for an example, parsed_html_files[i] has it's link stored in links[i]
  116. for i, parsed_html_file_i in enumerate(parsed_html_files):
  117.     link_to_i = html_files[i]
  118.     for j, parsed_html_file_j in enumerate(parsed_html_files):
  119.         j_dir = os.path.dirname(os.path.abspath(str(html_files[j])))
  120.         links_in_j = parsed_html_file_j.links
  121.         for link in links_in_j:
  122.             link_ = j_dir + link
  123.             if link_ == link_to_i:
  124.                 links[i].append(parsed_html_file_j)
  125.  
  126. print "parsing finished"
  127.  
  128. # each parsed file has a points attribute
  129. # which stores the information required
  130. # to sort them later
  131.  
  132. for i, parsed_html_file in enumerate(parsed_html_files):
  133.     try:
  134.         key_word_not_found = True
  135.         words = parsed_html_file.words
  136.         # all keywords need to be present in order
  137.         # to count them
  138.         for keyword in keywords:
  139.             for word_ in words:
  140.                 if keyword in word_:
  141.                     key_word_not_found = False
  142.             # if we don't find all of the words
  143.             # we need to skip this
  144.             if key_word_not_found:
  145.                 raise Exception
  146.         for word_ in words:
  147.                 for keyword_ in keywords:
  148.                  if keyword in word_:
  149.                     parsed_html_file.score += 1
  150.  
  151.         for parsed_html_file_that_links_to_i in links[i]:
  152.             try:
  153.                 words = parsed_html_file_that_links_to_i.words
  154.                 # all keywords need to be present in order
  155.                 # to count them
  156.                 for keyword in keywords:
  157.                     for word_ in words:
  158.                         if keyword in word_:
  159.                             key_word_not_found = False
  160.                     # if we don't find all of the words
  161.                     # we need to skip this
  162.                     if key_word_not_found:
  163.                         raise Exception
  164.                 for word_ in words:
  165.                         for keyword_ in keywords:
  166.                          if keyword in word_:
  167.                             parsed_html_file.score += 1
  168.             except:
  169.                 continue
  170.     except:
  171.         continue
  172.  
  173. print "search finished, sorting results"
  174. parsed_html_files = sorted(parsed_html_files, key=attrgetter('score'), reverse=True)
  175.  
  176. for i, parsed_html_file in enumerate(parsed_html_files):
  177.     print "| " +  html_files[i] + " score: "  + str(parsed_html_file.score)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement