Advertisement
Guest User

barop

a guest
Apr 4th, 2020
21
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.65 KB | None | 0 0
  1. class XyzCommentsSAXHandler(ContentHandler):
  2.     def __init__(self):
  3.         super().__init__()
  4.         self.updates: List[CommentUpdate] = []
  5.  
  6.         self._inside_entry: bool = False
  7.         self._inside_comment: bool = False
  8.         self._inside_vcard: bool = False
  9.  
  10.         self._li_depth: int = 0
  11.         self._p_depth: int = 0
  12.         self._div_depth: int = 0
  13.  
  14.         self._posted_time: Optional[str] = None
  15.         self._author_link: Optional[str] = None
  16.         self._comment_link_xyz: Optional[str] = None
  17.         self._comment_link_ru: Optional[str] = None
  18.         self._inner_html: str = ''
  19.  
  20.     def _reset_comment_state(self):
  21.         self._inside_entry = False
  22.         self._inside_comment = False
  23.         self._inside_vcard = False
  24.  
  25.         self._p_depth = 0
  26.         self._div_depth = 0
  27.  
  28.         self._posted_time = None
  29.         self._author_link = None
  30.         self._comment_link_xyz = None
  31.         self._comment_link_ru = None
  32.         self._inner_html = ''
  33.  
  34.     def startElementNS(self, name, qname, attrs):
  35.         elem_uri, elem_name = name
  36.         class_name = attrs.get((elem_uri, 'class'), None)
  37.  
  38.         # outside comments section
  39.         if not self._inside_entry:
  40.             if elem_name == 'li' and class_name == 'hcomment':
  41.                 self._inside_entry = True
  42.                 self._li_depth = 1
  43.             return
  44.        
  45.         # inside <div class="entry-comment">, write anything except <a> elements to inner_html
  46.         if self._inside_comment:
  47.             if elem_name == 'div':
  48.                 self._div_depth += 1
  49.             elif elem_name == 'br':  # self-closing HTML tag
  50.                 self._inner_html += '<br>'  # GK-style, without trailing slash
  51.                 return
  52.             elif elem_name == 'a':
  53.                 return
  54.  
  55.             attrs_string_list = []
  56.             for name in attrs.getQNames():
  57.                 attrs_string_list.append(f'{name}="{attrs.getValueByQName(name)}"')
  58.             elem_html = f'<{elem_name} {" ".join(attrs_string_list)}>'
  59.             self._inner_html += elem_html
  60.  
  61.             return
  62.  
  63.         # outside comment text
  64.  
  65.         if elem_name == 'p' and class_name == 'entry-info':
  66.             self._p_depth = 1
  67.             return
  68.        
  69.         # inside <p class="entry-info">
  70.         if self._p_depth > 0:
  71.             if elem_name == 'p':
  72.                 self._p_depth += 1
  73.  
  74.             elif elem_name == 'strong' and class_name == 'comment-author vcard entry-author':
  75.                 self._inside_vcard = True
  76.  
  77.             elif elem_name == 'time':
  78.                 self._posted_time = attrs.get((elem_uri, 'datetime'), None)
  79.                 if self._posted_time is None:
  80.                     raise ParseError('<time> element inside entry-info without datetime attribute')
  81.  
  82.             elif elem_name == 'a':
  83.                 if self._inside_vcard:
  84.                     self._author_link = attrs.get((elem_uri, 'href'), None)
  85.                     if self._author_link is None:
  86.                         raise ParseError('author link element without href attribute')
  87.                     return
  88.                
  89.                 if class_name == 'comment-link':
  90.                     self._comment_link_xyz = attrs.get((elem_uri, 'href'), None)
  91.                     if self._comment_link_xyz is None:
  92.                         raise ParseError('<a class="comment-link"> element without href attribute')
  93.                 elif class_name is None:
  94.                     if attrs.get((elem_uri, 'style'), None) == 'border-bottom:none':
  95.                         self._comment_link_ru = attrs.get((elem_uri, 'href'), None)
  96.                        
  97.             return
  98.  
  99.         if elem_name == 'div' and class_name == 'entry-comment':
  100.             self._inside_comment = True
  101.             self._div_depth = 1
  102.  
  103.     def _end_comment(self):
  104.         id_ru = None
  105.         id_xyz = None
  106.         user_id_ru = None
  107.         user_id_xyz = None
  108.         post_id = None
  109.  
  110.         # parse author
  111.         if self._author_link is None:  # Inho, why?..
  112.             user_id_ru = _GUEST8_RU_USER_ID
  113.         else:
  114.             match = _USER_LINK_RU_RE.match(self._author_link)
  115.             if match is not None:
  116.                 user_id_ru = int(match.group(1))
  117.             else:
  118.                 match = _USER_LINK_XYZ_RE.match(self._author_link)
  119.                 if match is not None:
  120.                     user_id_xyz = int(match.group(1))
  121.        
  122.         # parse id
  123.         if self._comment_link_xyz is not None:
  124.             match = _COMMENT_LINK_XYZ_RE.match(self._comment_link_xyz)
  125.             if match is not None:
  126.                 post_id = int(match.group(1))
  127.                 id_xyz = int(match.group(2))
  128.        
  129.         if self._comment_link_ru is not None:
  130.             match = _COMMENT_LINK_RU_RE.match(self._comment_link_ru)
  131.             if match is not None:
  132.                 post_id = int(match.group(1))
  133.                 id_ru = int(match.group(2))
  134.  
  135.         if post_id is None:
  136.             raise ParseError('No post_id found')
  137.  
  138.         if id_ru is None and id_xyz is None:
  139.             raise ParseError('No comment_id found (both id_ru and id_xyz are None)')
  140.  
  141.         if self._posted_time is None:
  142.             raise ParseError('No posted_time found')
  143.  
  144.         time_parsed = time.time()
  145.         time_posted = datetime.datetime.fromisoformat(self._posted_time).timestamp()
  146.         update = CommentUpdate(id_ru, id_xyz, post_id, self._inner_html, user_id_ru, user_id_xyz, time_posted, time_parsed)
  147.  
  148.         self.updates.append(update)
  149.         self._reset_comment_state()
  150.  
  151.     def endElementNS(self, name, qname):
  152.         elem_uri, elem_name = name
  153.         if elem_name == 'br':
  154.             return
  155.  
  156.         elif elem_name == 'li':
  157.             if self._li_depth == 1:
  158.                 self._inside_entry = False
  159.                 self._li_depth = 0
  160.                 return
  161.             elif self._li_depth > 1:
  162.                 self._li_depth -= 1
  163.            
  164.         elif elem_name == 'div':
  165.             if self._div_depth == 1:
  166.                 self._div_depth = 0
  167.                 self._end_comment()
  168.                 return  # do not write it to inner_html
  169.             elif self._div_depth > 1:
  170.                 self._div_depth -= 1
  171.  
  172.         elif elem_name == 'a':
  173.             return  # do not write it to inner_html
  174.  
  175.         elif elem_name == 'p':
  176.             self._p_depth = max(self._p_depth - 1, 0)
  177.  
  178.         elif elem_name == 'strong':
  179.             self._inside_vcard = False
  180.        
  181.         if self._inside_comment:
  182.             self._inner_html += f'</{elem_name}>'
  183.        
  184.  
  185.     def characters(self, content):
  186.         if self._inside_comment:
  187.             self._inner_html += content
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement