Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class XyzCommentsSAXHandler(ContentHandler):
    """Namespace-aware SAX handler that collects comments from a comments page.

    The handler looks for ``<li class="hcomment">`` entries.  Inside an entry
    it reads metadata from ``<p class="entry-info">`` (author link, posting
    time, comment links) and captures the markup of
    ``<div class="entry-comment">`` as a raw HTML string.  When that div is
    closed, a ``CommentUpdate`` is appended to ``self.updates``.

    ``CommentUpdate``, ``ParseError`` and the ``_*_RE`` / ``_GUEST8_RU_USER_ID``
    names are expected to be defined elsewhere in this module.
    """

    def __init__(self):
        super().__init__()
        # Parsed comments, in document order.
        self.updates: List[CommentUpdate] = []

        # Where in the document we currently are.
        self._inside_entry: bool = False    # inside <li class="hcomment">
        self._inside_comment: bool = False  # inside <div class="entry-comment">
        self._inside_vcard: bool = False    # inside the author <strong>

        # Nesting counters used to find the matching closing tag.
        self._li_depth: int = 0
        self._p_depth: int = 0
        self._div_depth: int = 0

        # Data collected for the comment currently being parsed.
        self._posted_time: Optional[str] = None
        self._author_link: Optional[str] = None
        self._comment_link_xyz: Optional[str] = None
        self._comment_link_ru: Optional[str] = None
        self._inner_html: str = ''

    def _reset_comment_state(self):
        """Forget everything collected for the current comment.

        ``_li_depth`` is deliberately left alone: the enclosing ``<li>`` is
        still open when the comment div closes, and ``endElementNS`` needs the
        counter to recognise its closing tag.
        """
        self._inside_entry = False
        self._inside_comment = False
        self._inside_vcard = False
        self._p_depth = 0
        self._div_depth = 0
        self._posted_time = None
        self._author_link = None
        self._comment_link_xyz = None
        self._comment_link_ru = None
        self._inner_html = ''

    def startElementNS(self, name, qname, attrs):
        """Handle an opening tag.

        Args:
            name: ``(namespace_uri, local_name)`` tuple of the element.
            qname: Qualified name of the element (unused).
            attrs: ``AttributesNS``-like object with the element attributes.

        Raises:
            ParseError: If a ``<time>`` or author ``<a>`` element inside
                entry-info, or an ``<a class="comment-link">``, lacks the
                attribute the handler needs.
        """
        elem_uri, elem_name = name
        # NOTE(review): attributes are looked up in the element's namespace;
        # this matches sources where both are un-namespaced -- confirm
        # against the parser feeding this handler.
        class_name = attrs.get((elem_uri, 'class'), None)

        # outside comments section: only the start of an entry is interesting
        if not self._inside_entry:
            if elem_name == 'li' and class_name == 'hcomment':
                self._inside_entry = True
                self._li_depth = 1
            return

        # Track <li> elements nested inside the entry (e.g. a list inside the
        # comment text) so that their closing tags are not mistaken for the
        # end of the entry itself.  endElementNS decrements this counter.
        if elem_name == 'li':
            self._li_depth += 1

        # inside <div class="entry-comment">, write anything except <a>
        # elements to inner_html
        if self._inside_comment:
            if elem_name == 'div':
                self._div_depth += 1
            elif elem_name == 'br':  # self-closing HTML tag
                self._inner_html += '<br>'  # GK-style, without trailing slash
                return
            elif elem_name == 'a':
                return
            attr_strings = [
                f'{attr_qname}="{attrs.getValueByQName(attr_qname)}"'
                for attr_qname in attrs.getQNames()
            ]
            # The space after the tag name is emitted even when there are no
            # attributes (e.g. '<p >'); kept as-is so stored HTML is unchanged.
            self._inner_html += f'<{elem_name} {" ".join(attr_strings)}>'
            return

        # outside comment text
        if elem_name == 'p' and class_name == 'entry-info':
            self._p_depth = 1
            return

        # inside <p class="entry-info">
        if self._p_depth > 0:
            if elem_name == 'p':
                self._p_depth += 1
            elif elem_name == 'strong' and class_name == 'comment-author vcard entry-author':
                self._inside_vcard = True
            elif elem_name == 'time':
                self._posted_time = attrs.get((elem_uri, 'datetime'), None)
                if self._posted_time is None:
                    raise ParseError('<time> element inside entry-info without datetime attribute')
            elif elem_name == 'a':
                if self._inside_vcard:
                    self._author_link = attrs.get((elem_uri, 'href'), None)
                    if self._author_link is None:
                        raise ParseError('author link element without href attribute')
                    return
                if class_name == 'comment-link':
                    self._comment_link_xyz = attrs.get((elem_uri, 'href'), None)
                    if self._comment_link_xyz is None:
                        raise ParseError('<a class="comment-link"> element without href attribute')
                elif class_name is None:
                    # A class-less link is recognised only by its inline style.
                    if attrs.get((elem_uri, 'style'), None) == 'border-bottom:none':
                        self._comment_link_ru = attrs.get((elem_uri, 'href'), None)
            return

        if elem_name == 'div' and class_name == 'entry-comment':
            self._inside_comment = True
            self._div_depth = 1

    def _end_comment(self):
        """Build a ``CommentUpdate`` from the collected state and store it.

        Called when the ``<div class="entry-comment">`` is closed.  The
        per-comment state is reset afterwards.

        Raises:
            ParseError: If no post id, no comment id, or no posting time
                could be determined.
        """
        id_ru = None
        id_xyz = None
        user_id_ru = None
        user_id_xyz = None
        post_id = None

        # parse author
        if self._author_link is None:
            # No author link at all: attributed to the guest user (the
            # original author's note here was "why?..").
            user_id_ru = _GUEST8_RU_USER_ID
        else:
            match = _USER_LINK_RU_RE.match(self._author_link)
            if match is not None:
                user_id_ru = int(match.group(1))
            else:
                match = _USER_LINK_XYZ_RE.match(self._author_link)
                if match is not None:
                    user_id_xyz = int(match.group(1))

        # parse id; either link may supply post_id, the "ru" link wins if
        # both are present because it is processed last
        if self._comment_link_xyz is not None:
            match = _COMMENT_LINK_XYZ_RE.match(self._comment_link_xyz)
            if match is not None:
                post_id = int(match.group(1))
                id_xyz = int(match.group(2))
        if self._comment_link_ru is not None:
            match = _COMMENT_LINK_RU_RE.match(self._comment_link_ru)
            if match is not None:
                post_id = int(match.group(1))
                id_ru = int(match.group(2))

        if post_id is None:
            raise ParseError('No post_id found')
        if id_ru is None and id_xyz is None:
            raise ParseError('No comment_id found (both id_ru and id_xyz are None)')
        if self._posted_time is None:
            raise ParseError('No posted_time found')

        time_parsed = time.time()
        time_posted = datetime.datetime.fromisoformat(self._posted_time).timestamp()
        update = CommentUpdate(id_ru, id_xyz, post_id, self._inner_html,
                               user_id_ru, user_id_xyz, time_posted, time_parsed)
        self.updates.append(update)
        self._reset_comment_state()

    def endElementNS(self, name, qname):
        """Handle a closing tag.

        Args:
            name: ``(namespace_uri, local_name)`` tuple of the element.
            qname: Qualified name of the element (unused).

        Raises:
            ParseError: Propagated from ``_end_comment`` when the comment div
                closes with incomplete metadata.
        """
        elem_uri, elem_name = name
        if elem_name == 'br':
            # already written as '<br>' by startElementNS
            return
        elif elem_name == 'li':
            if self._li_depth == 1:
                # closing tag of the entry itself
                self._inside_entry = False
                self._li_depth = 0
                return
            elif self._li_depth > 1:
                self._li_depth -= 1
        elif elem_name == 'div':
            if self._div_depth == 1:
                # closing tag of <div class="entry-comment">
                self._div_depth = 0
                self._end_comment()
                return  # do not write it to inner_html
            elif self._div_depth > 1:
                self._div_depth -= 1
        elif elem_name == 'a':
            return  # do not write it to inner_html
        elif elem_name == 'p':
            self._p_depth = max(self._p_depth - 1, 0)
        elif elem_name == 'strong':
            self._inside_vcard = False

        if self._inside_comment:
            self._inner_html += f'</{elem_name}>'

    def characters(self, content):
        """Append text found inside the comment body to the captured HTML.

        The text is appended as delivered by the parser, without re-escaping.
        """
        if self._inside_comment:
            self._inner_html += content
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement