Advertisement
BERKYT

Parse html file

May 6th, 2022
886
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.79 KB | None | 0 0
  1. import re
  2. import docx
  3. import codecs
  4.  
  5.  
  6. def find_table_title(line):
  7.     result = re.search(r'(?<=<tr><td width="100%" class="tabletitle"><b>)\w+', line)
  8.     if result:
  9.         return result.group(0)
  10.  
  11.  
  12. def find_table_sub_title(line):
  13.     result = re.search(r'(?<=<tr><td width="100%" class="tablesubtitle"><b>)\w+', line)
  14.     if result:
  15.         return result.group(0)
  16.  
  17.  
  18. def find_type_return(line):
  19.     result = re.search(r'(?<=<tr><td colspan="2"><span class="function"><span class="arg">)\w+', line)
  20.     if result:
  21.         return result.group(0)
  22.  
  23.  
  24. def find_name_function(line):
  25.     result = re.search(r'\w+', line)
  26.     if result:
  27.         return result.group(0)
  28.  
  29.  
  30. def find_description_function(line):
  31.     result = re.search(r'(?<=<p>)[\w+\s]+', line)
  32.     if result:
  33.         return result.group(0)
  34.  
  35.  
  36. def find_args(line):
  37.     type_arg = re.findall(r'(?<=<span class="arg">)\w+', line)
  38.     name_arg = re.findall(r'(?<=</span> )\w+', line)
  39.     result = dict()
  40.  
  41.     for key, value in zip(type_arg, name_arg):
  42.         result[key] = value
  43.  
  44.     if type_arg:
  45.         return result
  46.  
  47.  
  48. doc = docx.Document()
  49. style = doc.styles['Normal']
  50. style.font.size = docx.shared.Pt(14)
  51.  
  52. with codecs.open('function_list.htm', "r", "utf-16") as file:
  53.     with open('function_list_to_github.txt', 'w') as file_write:
  54.         for line in file:
  55.             if find_table_title(line):
  56.                 doc.add_paragraph(find_table_title(line), style='Title')
  57.                 file_write.write(f'# {find_table_title(line)}\n')
  58.             elif find_table_sub_title(line):
  59.                 doc.add_paragraph(find_table_sub_title(line), style='Title')
  60.                 file_write.write(f'# {find_table_sub_title(line)}\n')
  61.             elif find_type_return(line):
  62.                 next_line = file.readline()
  63.                 paragraph = doc.add_paragraph()
  64.                 paragraph.add_run(f'{find_type_return(line)} ').italic = True
  65.                 paragraph.add_run(find_name_function(next_line)).bold = True
  66.  
  67.                 file_write.write(f'## {find_type_return(line)} {find_name_function(next_line)}\n\n')
  68.  
  69.                 dict_args = find_args(next_line)
  70.                 file_write.write('Type | Name\n')
  71.                 file_write.write('------ | ------\n')
  72.                 if dict_args:
  73.                     for key, value in dict_args.items():
  74.                         file_write.write(f'{key} | {value}\n')
  75.                         doc.add_paragraph(f'Тип аргумента: {key}, имя аргумента: {value}', style='List Bullet')
  76.  
  77.                 file_write.write('\n')
  78.             elif find_description_function(line):
  79.                 doc.add_paragraph(find_description_function(line))
  80.                 file_write.write(f'{find_description_function(line)}\n')
  81.  
  82.  
  83. doc.save('function_list.docx')
  84.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement