Advertisement
nicuf

Transfer data from a website to another website

Mar 24th, 2022
992
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 29.60 KB | None | 0 0
  1. --------------------------------------
  2. EXPLANATION:
  3.  
  4. ROMANIAN: https://neculaifantanaru.com/python-transfer-data-from-a-website-to-another-website-design.html
  5. ENGLISH: https://neculaifantanaru.com/en/python-transfer-data-from-a-website-to-another-website-design.html
  6. --------------------------------------
  7.  
  8.  
  9.  
  10. #-------------------------------------------------------------------------------
  11. # Author:      Neculai Fantanaru
  12. #
  13. # Created:     24/03/2022
  14. # Copyright:   (c) Neculai Fantanaru
  15. # Licence:     <your licence>
  16. #-------------------------------------------------------------------------------
  17.  
  18. import os
  19. import re
  20.  
  21. def read_text_from_file(file_path):
  22.     """
  23.    Aceasta functie returneaza continutul unui fisier.
  24.    file_path: calea catre fisierul din care vrei sa citesti
  25.    """
  26.     with open(file_path, encoding='utf8') as f:
  27.         text = f.read()
  28.         return text
  29.  
  30.  
  31. def write_to_file(text, file_path):
  32.     """
  33.    Aceasta functie scrie un text intr-un fisier.
  34.    text: textul pe care vrei sa il scrii
  35.    file_path: calea catre fisierul in care vrei sa scrii
  36.    """
  37.     with open(file_path, 'wb') as f:
  38.         f.write(text.encode('utf8', 'ignore'))
  39.  
  40.  
  41. def copiaza_continut_html(cale_fisier_html, cale_fisiere_gata): # astea sunt argumentele functiei, adica cand apelez functia
  42.     # citesti textul din fisierul html
  43.     text_html = read_text_from_file(cale_fisier_html)
  44.     final_text = ''
  45.  
  46.     # === fisier html vechi ===
  47.     articol_categorie_pattern = re.compile('<!-- ARTICOL CATEGORIE START -->([\s\S]*?)<!-- ARTICOL CATEGORIE FINAL -->')
  48.     articol_categorie = re.findall(articol_categorie_pattern, text_html)
  49.     if len(articol_categorie) != 0:
  50.         # === citire fisier model - index2.html ===
  51.         text_html_model = read_text_from_file("C:\\Folder1\\index2.html")
  52.         articol_categorie = articol_categorie[0]
  53.  
  54.         # ==== INLOCUIRE <td><span class="den_articol"> CU <td><span class="linkMare"> ====
  55.         span_pattern = re.compile('<td><span class="den_articol"><a href=\"(.*?)\" class="linkMare">(.*?)</a></span></td>')
  56.         span_nou = '<td><span class="linkMare"><a href="{}" class="linkMare"><span class="den_articol">{}</span></a></span></td>'
  57.         span = re.findall(span_pattern, articol_categorie)
  58.         lista_span_nou = list()
  59.         for i in range(len(span)):
  60.             lista_span_nou.append(span_nou.format(span[i][0], span[i][1]))
  61.         span_pattern = re.compile('<td><span class="den_articol"><a href=\".*?\" class="linkMare">.*?</a></span></td>')
  62.         span = re.findall(span_pattern, articol_categorie)
  63.         for i in range(len(span)):
  64.             articol_categorie = articol_categorie.replace(span[i], lista_span_nou[i])
  65.         # ==== INLOCUIRE <td><span class="den_articol"> CU <td><span class="linkMare"> ====
  66.  
  67.         # ==== Informatii fisier original ====
  68.         categ_link_title_pattern = re.compile('<td><span class="linkMare"><a href="(.*?)" class="linkMare"><span class="den_articol">(.*?)</span></a></span></td>')
  69.         categ_link_title = re.findall(categ_link_title_pattern, articol_categorie)
  70.         print("Total {} ARTICOLE".format(len(categ_link_title)))
  71.         categ_date_link_title_desc_pattern = re.compile('<td class="text_dreapta">(.*?)<a href=\"(.*?)\" title=\"(.*?)\" class="external" rel="category tag">(.*?)</a>, by Neculai Fantanaru</td>')
  72.         categ_date_link_title_desc = re.findall(categ_date_link_title_desc_pattern, articol_categorie)
  73.         paragraf_pattern = re.compile('<p class="text_obisnuit2"><em>(.*?)</em></p>')
  74.         paragraf = re.findall(paragraf_pattern, articol_categorie)
  75.  
  76.         # === citeste mai departe - buton ===
  77.         citeste_buton_pattern = re.compile('<div align="right" id="external2"><a href=\"(.*?)\">cite&#351;te mai departe </a>')
  78.         citeste_buton = re.findall(citeste_buton_pattern, articol_categorie)
  79.         read_more_buton_pattern = re.compile('<div align="right" id="external2"><a href=\"(.*?)\">read more </a>')
  80.         read_more_buton = re.findall(read_more_buton_pattern, articol_categorie)
  81.  
  82.         # === Informatii index2 ===
  83.         articol_categorie_index2_pattern = re.compile('<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
  84.         articol_categorie_index2 = re.findall(articol_categorie_index2_pattern, text_html_model)
  85.         if len(articol_categorie_index2) != 0:
  86.             articol_categorie_index2 = articol_categorie_index2[0] # trebuie inlocuit cu toate categoriile din fisierul original
  87.             # citire template pentru categorie din index2.html
  88.             template_categorie = read_text_from_file("C:\\Folder1\\template_categorie.txt")
  89.  
  90.             # h3 => title + description
  91.             h3_pattern = re.compile('<h3 class="font-weight-normal" itemprop="name"><a href=\"(.*?)\" class="color-black">(.*?)</a></h3>')
  92.             h3 = re.findall(h3_pattern, template_categorie)
  93.             h3 = h3[0]
  94.             # dates section din index2.html
  95.             dates_section_index2_pattern = re.compile('<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->')
  96.             dates_section_index2 = re.findall(dates_section_index2_pattern, template_categorie)
  97.             dates_section_index2 = dates_section_index2[0]
  98.             date_index2_pattern = re.compile('<a href="javascript:void\(0\)" class="color-black">(.*?)</a>')
  99.             # date
  100.             date_index2 = re.findall(date_index2_pattern, dates_section_index2)
  101.             date_index2 = date_index2[0]
  102.             # link / title / description
  103.             link_title_desc_index2_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\" class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')
  104.             link_title_desc_index2 = re.findall(link_title_desc_index2_pattern, dates_section_index2)
  105.             link_title_desc_index2 = link_title_desc_index2[0]
  106.  
  107.             # paragraf
  108.             paragraf_index2_pattern = re.compile('<p class="mb-35px color-grey line-height-25px">(.*?)</p>')
  109.             paragraf_index2 = re.findall(paragraf_index2_pattern, template_categorie)
  110.             paragraf_index2 = paragraf_index2[0]
  111.  
  112.             # === read more ===
  113.             read_more_pattern = re.compile('<a href=\"(.*?)\" class="btn-setting color-black btn-hvr-up btn-blue btn-hvr-pink">read more</a>')
  114.             read_more = re.findall(read_more_pattern, template_categorie)
  115.             read_more = read_more[0]
  116.  
  117.             butoane = list()
  118.             if len(citeste_buton) > 0:
  119.                 butoane = citeste_buton
  120.             else:
  121.                 butoane = read_more_buton
  122.  
  123.             for i in range(len(categ_link_title)):
  124.                 new_template = template_categorie
  125.                 # === facem replace cu informatiile din articolul original ===
  126.                 new_template_1 = new_template.replace(date_index2, categ_date_link_title_desc[i][0].replace(', in', '').strip())
  127.                 new_template_2 = new_template_1.replace(link_title_desc_index2[0], categ_date_link_title_desc[i][1])
  128.                 new_template_3 = new_template_2.replace(link_title_desc_index2[1], categ_date_link_title_desc[i][2])
  129.                 new_template_4 = new_template_3.replace(link_title_desc_index2[2], categ_date_link_title_desc[i][3].lstrip())
  130.                 new_template_5 = new_template_4.replace(paragraf_index2, paragraf[i])
  131.                 new_template_6 = new_template_5.replace(read_more, butoane[i])
  132.                 new_template_7 = new_template_6.replace(h3[0], categ_link_title[i][0])
  133.                 new_template_8 = new_template_7.replace(h3[1], categ_link_title[i][1])
  134.                 final_text = final_text + new_template_8 + '\n'
  135.  
  136.             text_html_model = text_html_model.replace(articol_categorie_index2, final_text)
  137.             final_text = text_html_model
  138.  
  139.             # schimbare CATEGORIES index2
  140.             # preluare lista fisier html
  141.             lista_pattern = re.compile('<ul id="sidebarNavigation">([\s\S]*?)</ul>')
  142.             lista = re.findall(lista_pattern, text_html)
  143.             if len(lista) != 0:
  144.                 lista = lista[0]
  145.                 elemente_lista_pattern = re.compile('<li><a href=\"(.*?)\" title=\"(.*?)\">(.*?) \((.*?)\)</a></li>')
  146.                 elemente_lista = re.findall(elemente_lista_pattern, lista)
  147.                 if elemente_lista != 0:
  148.                     categories_pattern = re.compile('<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
  149.                     categories = re.findall(categories_pattern, final_text)
  150.                     if len(categories) != 0:
  151.                         categories = categories[0]
  152.                         elemente_lista_model_pattern = re.compile('<div class="categories-name">([\s\S]*?)</div>')
  153.                         elemente_lista_model = re.findall(elemente_lista_model_pattern, categories)
  154.                         template_category = read_text_from_file('C:\\Folder1\\category-name.txt')
  155.  
  156.                         for i in range(len(elemente_lista_model)):
  157.                             new_template_category = template_category
  158.                             a_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\">')
  159.                             a = re.findall(a_pattern, new_template_category)[0]
  160.                             p_pattern = re.compile('<p class="font-16 color-grey text-capitalize"><i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) <span>(.*?)</span> </p>')
  161.                             p = re.findall(p_pattern, new_template_category)[0]
  162.                             new_template_category = new_template_category.replace(a[0], elemente_lista[i][0])
  163.                             new_template_category = new_template_category.replace(a[1], elemente_lista[i][1])
  164.                             new_template_category = new_template_category.replace(p[0], elemente_lista[i][2])
  165.                             new_template_category = new_template_category.replace(p[1], elemente_lista[i][3])
  166.                             # print(final_text)
  167.                             final_text = final_text.replace(elemente_lista_model[i], new_template_category)
  168.  
  169.                     else:
  170.                         print("No categories + books start")
  171.                 else:
  172.                     print("Niciun element <li>.")
  173.             else:
  174.                 print("Tag <ul> gol.")
  175.  
  176.  
  177.             # Shimbare LINK-URI FLAGS
  178.             flags_pattern = re.compile('<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
  179.             flags = re.findall(flags_pattern, text_html)
  180.             if len(flags) != 0:
  181.                 flags = flags[0]
  182.                 links_pattern = re.compile('<a href=\"(.*?)\">')
  183.                 links = re.findall(links_pattern, flags)
  184.                 if len(links) != 0:
  185.                     # print("Links: ", links)
  186.                     flags_model = re.findall(flags_pattern, final_text)
  187.                     if len(flags_model) != 0:
  188.                         flags_model = flags_model[0]
  189.                         links_pattern_model = re.compile('<li><a cunt_code=\"\+\d+\" href=\"(.*?)\">')
  190.                         links_model = re.findall(links_pattern_model, flags_model)
  191.                         if len(links_model) != 0:
  192.                             for i in range(len(links)):
  193.                                 # print(links[i], links_model[i])
  194.                                 final_text = final_text.replace(links_model[i], links[i]) # FACE REPLACE
  195.                         else:
  196.                             print("Fara links in flags model")
  197.                     else:
  198.                         print("Fara links in flags model")
  199.                 else:
  200.                     print("Fara linkuri in flags.")
  201.             else:
  202.                 print("Fara flags in articol original.")
  203.  
  204.             # STARS - PHP
  205.             stars_php_pattern = re.compile('\$item_id = (.*?);')
  206.             stars_php = re.findall(stars_php_pattern, text_html)
  207.             stars_php_model = re.findall(stars_php_pattern, final_text)
  208.             if len(stars_php) != 0:
  209.                 stars_php = stars_php[0]
  210.                 if len(stars_php_model) != 0:
  211.                     stars_php_model = stars_php_model[0]
  212.                     final_text = final_text.replace(stars_php_model, stars_php) # FACE REPLACE
  213.                 else:
  214.                     print("No stars fisier model")
  215.             else:
  216.                 print("No stars fisier original")
  217.  
  218.              # TITLE
  219.             title_pattern = re.compile('<title>(.*?)</title>')
  220.             text_title = re.findall(title_pattern, text_html)
  221.             text_title_model = re.findall(title_pattern, final_text)
  222.             if len(text_title) != 0 and len(text_title_model) != 0:
  223.                 text_title = text_title[0]
  224.                 text_title_model = text_title_model[0]
  225.                 final_text = final_text.replace(text_title_model, text_title)
  226.             else:
  227.                 print("Fisier html fara tag title: {}".format(cale_fisier_html))
  228.  
  229.             # DESCRIPTION
  230.             description_pattern = re.compile('<meta name="description" content="(.*?)">')
  231.             text_description = re.findall(description_pattern, text_html)
  232.             text_description_model = re.findall(description_pattern, final_text)
  233.             if len(text_description) != 0 and len(text_description_model) != 0:
  234.                 text_description = text_description[0]
  235.                 text_description_model = text_description_model[0]
  236.                 final_text = final_text.replace(text_description_model, text_description)
  237.             else:
  238.                 print("Fisier html fara tag description: {}".format(cale_fisier_html))
  239.  
  240.             # CANONICAL
  241.             canonical_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
  242.             text_canonical = re.findall(canonical_pattern, text_html)
  243.             text_canonical_model = re.findall(canonical_pattern, final_text)
  244.             if len(text_canonical) != 0 and len(text_canonical_model) != 0:
  245.                 text_canonical = text_canonical[0]
  246.                 text_canonical_model = text_canonical_model[0]
  247.                 final_text = final_text.replace(text_canonical_model, text_canonical)
  248.             else:
  249.                 print("Fisier html fara tag canonical: {}".format(cale_fisier_html))
  250.  
  251.             # ULTIMELE ARTICOLE
  252.             ult_art_pattern = re.compile('<!-- Ultimele articole -->([\s\S]*?)<!-- Ultimele articole final -->')
  253.             ult_art_model_pattern = re.compile('<!-- Recent Post -->([\s\S]*?)<!-- Categories -->')
  254.             ult_art = re.findall(ult_art_pattern, text_html)
  255.             ult_art_model = re.findall(ult_art_model_pattern, final_text)
  256.             if len(ult_art) != 0:
  257.                 ult_art = ult_art[0]
  258.                 if len(ult_art_model) != 0:
  259.                     ult_art_model = ult_art_model[0]
  260.                     articole_pattern = re.compile('<li><a href=\"(.*?)\">(.*?)</a></li>')
  261.                     articole = re.findall(articole_pattern, ult_art)
  262.                     if len(articole) != 0:
  263.                         articole_model_pattern = re.compile('<a href=\"(.*?)\" class="color-grey">(.*?)</a>')
  264.                         articole_model = re.findall(articole_model_pattern, ult_art_model)
  265.                         if len(articole_model) != 0:
  266.                             for i in range(len(articole)):
  267.                                 # href - 0 / description - 1
  268.                                 final_text = final_text.replace(articole_model[i][0], articole[i][0])
  269.                                 final_text = final_text.replace(articole_model[i][1], articole[i][1])
  270.                         else:
  271.                             print("No articole fisier model")
  272.                     else:
  273.                         print("No articole fisier original")
  274.                 else:
  275.                     print("No lista articole fisier model")
  276.             else:
  277.                 print("No lista articole fisier original")
  278.  
  279.         else:
  280.             print("Nu exista articol categorie in index2.html")
  281.     else:
  282.         # === citire fisier model - index.html ===
  283.         text_html_model = read_text_from_file("C:\\Folder1\\index.html")
  284.         # ARTICOL START - FINAL
  285.         articol_pattern = re.compile('<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->[\s\S]*?')
  286.         text_articol = re.findall(articol_pattern, text_html)
  287.         text_articol_model = re.findall(articol_pattern, text_html_model)
  288.         if len(text_articol) != 0 and len(text_articol_model) != 0:
  289.             text_articol = text_articol[0]
  290.             text_articol_model = text_articol_model[0]
  291.             text_html_model_1 = text_html_model.replace(text_articol_model, text_articol)
  292.             final_text = text_html_model_1
  293.         else:
  294.             print("Fisier html fara ARTICOL START/FINAL: {}".format(cale_fisier_html))
  295.  
  296.         # TITLE
  297.         title_pattern = re.compile('<title>(.*?)</title>')
  298.         text_title = re.findall(title_pattern, text_html)
  299.         text_title_model = re.findall(title_pattern, text_html_model_1)
  300.         if len(text_title) != 0 and len(text_title_model) != 0:
  301.             text_title = text_title[0]
  302.             text_title_model = text_title_model[0]
  303.             text_html_model_2 = text_html_model_1.replace(text_title_model, text_title)
  304.             final_text = text_html_model_2
  305.         else:
  306.             print("Fisier html fara tag title: {}".format(cale_fisier_html))
  307.  
  308.         # DESCRIPTION
  309.         description_pattern = re.compile('<meta name="description" content="(.*?)">')
  310.         text_description = re.findall(description_pattern, text_html)
  311.         text_description_model = re.findall(description_pattern, text_html_model_2)
  312.         if len(text_description) != 0 and len(text_description_model) != 0:
  313.             text_description = text_description[0]
  314.             text_description_model = text_description_model[0]
  315.             text_html_model_3 = text_html_model_2.replace(text_description_model, text_description)
  316.             final_text = text_html_model_3
  317.         else:
  318.             print("Fisier html fara tag description: {}".format(cale_fisier_html))
  319.  
  320.         # CANONICAL
  321.         canonical_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
  322.         text_canonical = re.findall(canonical_pattern, text_html)
  323.         text_canonical_model = re.findall(canonical_pattern, text_html_model_3)
  324.         if len(text_canonical) != 0 and len(text_canonical_model) != 0:
  325.             text_canonical = text_canonical[0]
  326.             text_canonical_model = text_canonical_model[0]
  327.             text_html_model_4 = text_html_model_3.replace(text_canonical_model, text_canonical)
  328.             final_text = text_html_model_4
  329.         else:
  330.             print("Fisier html fara tag canonical: {}".format(cale_fisier_html))
  331.  
  332.  
  333.         # remove DIV tag and TABLE tag
  334.         text_articol_model = re.findall(articol_pattern, text_html_model_4)
  335.         text_articol_model_old = text_articol_model[0]
  336.         text_articol_model = text_articol_model[0]
  337.         text_articol_model = text_articol_model.replace("<div align=\"justify\">", '')
  338.         text_articol_model = text_articol_model.replace("</div>", '')
  339.  
  340.         table_pattern = re.compile('<table[\s\S]*?</table>')
  341.         text_table = re.findall(table_pattern, text_articol_model)
  342.         if len(text_table) != 0:
  343.             text_table = text_table[0]
  344.             text_articol_model = text_articol_model.replace(text_table, '')
  345.             text_html_model_5 = text_html_model_4.replace(text_articol_model_old, text_articol_model)
  346.             final_text = text_html_model_5
  347.         else:
  348.             print("No text table")
  349.  
  350.         # schimbare tag-uri ARTICLE TITLE
  351.         article_title_pattern = re.compile('<h1 class="den_articol" itemprop="name">(.*?)</h1>')
  352.         article_title = re.findall(article_title_pattern, text_articol_model_old)
  353.         if len(article_title) != 0:
  354.             article_title = article_title[0]
  355.             h3_title_pattern = re.compile('<h3 class="font-weight-normal" itemprop="name"><a href="javascript:void\(0\)" class="color-black">(.*?)</a></h3>')
  356.             h3_title = re.findall(h3_title_pattern, text_html_model_5)
  357.             if len(h3_title) != 0:
  358.                 h3_title = h3_title[0]
  359.                 text_html_model_6 = text_html_model_5.replace(h3_title, article_title)
  360.                 final_text = text_html_model_6
  361.             else:
  362.                 print("No h3 title.")
  363.         else:
  364.             print("No article title.")
  365.  
  366.         # schimbare DATE
  367.         date_pattern = re.compile('<td class="text_dreapta">(.*?), in <a')
  368.         date = re.findall(date_pattern, text_articol_model_old)
  369.         if len(date) != 0:
  370.             date = date[0]
  371.             # MODIFICARE 09/03
  372.             date_section_pattern = re.compile('<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->')
  373.             date_section = re.findall(date_section_pattern, text_html_model_6)
  374.             if len(date_section) > 0:
  375.                 date_section = date_section[0]
  376.                 date_pattern_model = re.compile('<a href="javascript:void\(0\)" class="color-black">(.*?)</a>')
  377.                 date_model = re.findall(date_pattern_model, date_section)
  378.                 if len(date_model) != 0:
  379.                     date_model = date_model[0]
  380.                     text_html_model_7 = text_html_model_6.replace(date_model, date)
  381.                     final_text = text_html_model_7
  382.                 else:
  383.                     print('No date in model.')
  384.             else:
  385.                 print("No date section: <!--STARTDATES--><!--FINNISHDATES-->")
  386.         else:
  387.             print("No date.")
  388.  
  389.         # schimbare SECTION
  390.         section_pattern_model = re.compile('<a href=\"(.*?)\" title=\"(.*?)\" class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')
  391.         section_model = re.findall(section_pattern_model, text_html_model_7)
  392.         # print(section_model)
  393.         if len(section_model) != 0:
  394.             section_model = section_model[0]
  395.             section_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\" class="external" rel="category tag">(.*?)</a>')
  396.             section = re.findall(section_pattern, text_articol_model_old)
  397.             if len(section) != 0:
  398.                 section = section[0]
  399.                 text_html_model_8 = text_html_model_7.replace(section_model[0], section[0])
  400.                 text_html_model_9 = text_html_model_8.replace(section_model[1], section[1])
  401.                 text_html_model_10 = text_html_model_9.replace(section_model[2], section[2])
  402.                 final_text = text_html_model_10
  403.             else:
  404.                 print("No section.")
  405.         else:
  406.             print("No section model.")
  407.  
  408.         # schimbare CATEGORIES
  409.         # preluare lista fisier html
  410.         lista_pattern = re.compile('<ul id="sidebarNavigation">([\s\S]*?)</ul>')
  411.         lista = re.findall(lista_pattern, text_html)
  412.         if len(lista) != 0:
  413.             lista = lista[0]
  414.             elemente_lista_pattern = re.compile('<li><a href=\"(.*?)\" title=\"(.*?)\">(.*?) \((.*?)\)</a></li>')
  415.             elemente_lista = re.findall(elemente_lista_pattern, lista)
  416.             if elemente_lista != 0:
  417.                 categories_pattern = re.compile('<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
  418.                 categories = re.findall(categories_pattern, text_html_model_10)
  419.                 if len(categories) != 0:
  420.                     categories = categories[0]
  421.                     elemente_lista_model_pattern = re.compile('<div class="categories-name">([\s\S]*?)</div>')
  422.                     elemente_lista_model = re.findall(elemente_lista_model_pattern, categories)
  423.                     template_category = read_text_from_file('C:\\Folder1\\category-name.txt')
  424.  
  425.                     for i in range(len(elemente_lista_model)):
  426.                         new_template_category = template_category
  427.                         a_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\">')
  428.                         a = re.findall(a_pattern, new_template_category)[0]
  429.                         p_pattern = re.compile('<p class="font-16 color-grey text-capitalize"><i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) <span>(.*?)</span> </p>')
  430.                         p = re.findall(p_pattern, new_template_category)[0]
  431.                         new_template_category = new_template_category.replace(a[0], elemente_lista[i][0])
  432.                         new_template_category = new_template_category.replace(a[1], elemente_lista[i][1])
  433.                         new_template_category = new_template_category.replace(p[0], elemente_lista[i][2])
  434.                         new_template_category = new_template_category.replace(p[1], elemente_lista[i][3])
  435.                         # print(final_text)
  436.                         final_text = final_text.replace(elemente_lista_model[i], new_template_category)
  437.                         # print("==========================")
  438.                         # print(final_text)
  439.                     text_html_model_14 = final_text
  440.                 else:
  441.                     print("No categories + books start")
  442.             else:
  443.                 print("Niciun element <li>.")
  444.         else:
  445.             print("Tag <ul> gol.")
  446.  
  447.         # Shimbare LINK-URI FLAGS
  448.         flags_pattern = re.compile('<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
  449.         flags = re.findall(flags_pattern, text_html)
  450.         if len(flags) != 0:
  451.             flags = flags[0]
  452.             links_pattern = re.compile('<a href=\"(.*?)\">')
  453.             links = re.findall(links_pattern, flags)
  454.             if len(links) != 0:
  455.                 # print("Links: ", links)
  456.                 flags_model = re.findall(flags_pattern, text_html_model_14)
  457.                 if len(flags_model) != 0:
  458.                     flags_model = flags_model[0]
  459.                     # print("Flags: ", flags_model)
  460.                     links_pattern_model = re.compile('<li><a cunt_code=\"\+\d+\" href=\"(.*?)\">')
  461.                     links_model = re.findall(links_pattern_model, flags_model)
  462.                     # print(links_model)
  463.                     text_html_model_15 = text_html_model_14
  464.                     if len(links_model) != 0:
  465.                         for i in range(len(links)):
  466.                             # print(links[i], links_model[i])
  467.                             text_html_model_15 = text_html_model_15.replace(links_model[i], links[i]) # FACE REPLACE
  468.                             final_text = text_html_model_15
  469.                     else:
  470.                         print("Fara links in flags model")
  471.                 else:
  472.                     print("Fara links in flags model")
  473.             else:
  474.                 print("Fara linkuri in flags.")
  475.         else:
  476.             print("Fara flags in articol original.")
  477.  
  478.         # STARS - PHP
  479.         stars_php_pattern = re.compile('\$item_id = (.*?);')
  480.         stars_php = re.findall(stars_php_pattern, text_html)
  481.         stars_php_model = re.findall(stars_php_pattern, text_html_model_15)
  482.         if len(stars_php) != 0:
  483.             stars_php = stars_php[0]
  484.             if len(stars_php_model) != 0:
  485.                 stars_php_model = stars_php_model[0]
  486.                 text_html_model_16 = text_html_model_15.replace(stars_php_model, stars_php) # FACE REPLACE
  487.                 final_text = text_html_model_16
  488.             else:
  489.                 print("No stars fisier model")
  490.         else:
  491.             print("No stars fisier original")
  492.  
  493.         # ULTIMELE ARTICOLE
  494.         ult_art_pattern = re.compile('<!-- Ultimele articole -->([\s\S]*?)<!-- Ultimele articole final -->')
  495.         ult_art_model_pattern = re.compile('<!-- Recent Post -->([\s\S]*?)<!-- Categories -->')
  496.         ult_art = re.findall(ult_art_pattern, text_html)
  497.         ult_art_model = re.findall(ult_art_model_pattern, text_html_model_16)
  498.         if len(ult_art) != 0:
  499.             ult_art = ult_art[0]
  500.             if len(ult_art_model) != 0:
  501.                 ult_art_model = ult_art_model[0]
  502.                 articole_pattern = re.compile('<li><a href=\"(.*?)\">(.*?)</a></li>')
  503.                 articole = re.findall(articole_pattern, ult_art)
  504.                 if len(articole) != 0:
  505.                     articole_model_pattern = re.compile('<a href=\"(.*?)\" class="color-grey">(.*?)</a>')
  506.                     articole_model = re.findall(articole_model_pattern, ult_art_model)
  507.                     if len(articole_model) != 0:
  508.                         for i in range(len(articole)):
  509.                             # href - 0 / description - 1
  510.                             # MODIFICARE 09/03
  511.                             final_text = final_text.replace(articole_model[i][0], articole[i][0])
  512.                             final_text = final_text.replace(articole_model[i][1], articole[i][1])
  513.                     else:
  514.                         print("No articole fisier model")
  515.                 else:
  516.                     print("No articole fisier original")
  517.             else:
  518.                 print("No lista articole fisier model")
  519.         else:
  520.             print("No lista articole fisier original")
  521.  
  522.     file_path = cale_fisiere_gata + "\\" + os.path.basename(cale_fisier_html)
  523.     write_to_file(final_text, file_path)
  524.     print("Scriere efectuata cu succes.")
  525.  
  526.  
  527.  
  528. def creare_fisiere_html(cale_folder_html, cale_fisiere_gata):
  529.     """
  530.    Functia itereaza printr-un folder care contine fisiere txt si creeaza fisiere html corespunzatoare
  531.    """
  532.     count = 0
  533.     for f in os.listdir(cale_folder_html):
  534.             if f.endswith('.html'):
  535.                 cale_fisier_html = cale_folder_html + "\\" + f
  536.                 print("FISIER CURENT: ", cale_fisier_html)
  537.                 copiaza_continut_html(cale_fisier_html, cale_fisiere_gata)
  538.                 count += 1
  539.             else:
  540.                 continue
  541.     print("Numarul de fisiere modificate: ", count)
  542.  
  543. def main():
  544.     creare_fisiere_html("C:\\Folder1\\fisiere_html", "C:\\Folder1\\fisiere_gata")
  545.  
  546. if __name__ == '__main__':
  547.     main()
  548.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement