# Advertisement
# nicuf
#
# asa
#
# Oct 24th, 2023
# 93
# 0
# Never
# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
# text 29.41 KB | None | 0 0
#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:     Copy the content of old HTML pages into the new model pages
#              (index.html / index2.html) and write the rebuilt pages to an
#              output folder.
#
# Author:      Castel
#
# Created:     22/01/2022
# Copyright:   (c) Castel 2022
# Licence:     <your licence>
#-------------------------------------------------------------------------------

import os
import re


  15. def read_text_from_file(file_path):
  16. """
  17. Aceasta functie returneaza continutul unui fisier.
  18. file_path: calea catre fisierul din care vrei sa citesti
  19. """
  20. with open(file_path, encoding='utf8') as f:
  21. text = f.read()
  22. return text
  23.  
  24.  
  25. def write_to_file(text, file_path):
  26. """
  27. Aceasta functie scrie un text intr-un fisier.
  28. text: textul pe care vrei sa il scrii
  29. file_path: calea catre fisierul in care vrei sa scrii
  30. """
  31. with open(file_path, 'wb') as f:
  32. f.write(text.encode('utf8', 'ignore'))
  33.  
  34.  
  35. def copiaza_continut_html(cale_fisier_html, cale_fisiere_gata): # astea sunt argumentele functiei, adica cand apelez functia
  36. # citesti textul din fisierul html
  37. text_html = read_text_from_file(cale_fisier_html)
  38. final_text = ''
  39.  
  40. # === fisier html vechi ===
  41. articol_categorie_pattern = re.compile('<!-- ARTICOL CATEGORIE START -->([\s\S]*?)<!-- ARTICOL CATEGORIE FINAL -->')
  42. articol_categorie = re.findall(articol_categorie_pattern, text_html)
  43. if len(articol_categorie) != 0:
  44. # === citire fisier model - index2.html ===
  45. text_html_model = read_text_from_file("C:\\Folder1\\index2.html")
  46. articol_categorie = articol_categorie[0]
  47.  
  48. # ==== INLOCUIRE <td><span class="den_articol"> CU <td><span class="linkMare"> ====
  49. span_pattern = re.compile('<td><span class="den_articol"><a href=\"(.*?)\" class="linkMare">(.*?)</a></span></td>')
  50. span_nou = '<td><span class="linkMare"><a href="{}" class="linkMare"><span class="den_articol">{}</span></a></span></td>'
  51. span = re.findall(span_pattern, articol_categorie)
  52. lista_span_nou = list()
  53. for i in range(len(span)):
  54. lista_span_nou.append(span_nou.format(span[i][0], span[i][1]))
  55. span_pattern = re.compile('<td><span class="den_articol"><a href=\".*?\" class="linkMare">.*?</a></span></td>')
  56. span = re.findall(span_pattern, articol_categorie)
  57. for i in range(len(span)):
  58. articol_categorie = articol_categorie.replace(span[i], lista_span_nou[i])
  59. # ==== INLOCUIRE <td><span class="den_articol"> CU <td><span class="linkMare"> ====
  60.  
  61. # ==== Informatii fisier original ====
  62. categ_link_title_pattern = re.compile('<td><span class="linkMare"><a href="(.*?)" class="linkMare"><span class="den_articol">(.*?)</span></a></span></td>')
  63. categ_link_title = re.findall(categ_link_title_pattern, articol_categorie)
  64. print("Total {} ARTICOLE".format(len(categ_link_title)))
  65. categ_date_link_title_desc_pattern = re.compile('<td class="text_dreapta">(.*?)<a href=\"(.*?)\" title=\"(.*?)\" class="external" rel="category tag">(.*?)</a>, by Neculai Fantanaru</td>')
  66. categ_date_link_title_desc = re.findall(categ_date_link_title_desc_pattern, articol_categorie)
  67. paragraf_pattern = re.compile('<p class="text_obisnuit2"><em>(.*?)</em></p>')
  68. paragraf = re.findall(paragraf_pattern, articol_categorie)
  69.  
  70. print("PARAGRAF", len(paragraf))
  71.  
  72. # === citeste mai departe - buton ===
  73. citeste_buton_pattern = re.compile('<div align="right" id="external2"><a href=\"(.*?)\">cite&#351;te mai departe </a>')
  74. citeste_buton = re.findall(citeste_buton_pattern, articol_categorie)
  75. read_more_buton_pattern = re.compile('<div align="right" id="external2"><a href=\"(.*?)\">read more </a>')
  76. read_more_buton = re.findall(read_more_buton_pattern, articol_categorie)
  77.  
  78. # === Informatii index2 ===
  79. articol_categorie_index2_pattern = re.compile('<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->')
  80. articol_categorie_index2 = re.findall(articol_categorie_index2_pattern, text_html_model)
  81. if len(articol_categorie_index2) != 0:
  82. articol_categorie_index2 = articol_categorie_index2[0] # trebuie inlocuit cu toate categoriile din fisierul original
  83. # citire template pentru categorie din index2.html
  84. template_categorie = read_text_from_file("C:\\Folder1\\template_categorie.txt")
  85.  
  86. # h3 => title + description
  87. h3_pattern = re.compile('<h3 class="font-weight-normal" itemprop="name"><a href=\"(.*?)\" class="color-black">(.*?)</a></h3>')
  88. h3 = re.findall(h3_pattern, template_categorie)
  89. h3 = h3[0]
  90. # dates section din index2.html
  91. dates_section_index2_pattern = re.compile('<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->')
  92. dates_section_index2 = re.findall(dates_section_index2_pattern, template_categorie)
  93. dates_section_index2 = dates_section_index2[0]
  94. date_index2_pattern = re.compile('<a href="javascript:void\(0\)" class="color-black">(.*?)</a>')
  95. # date
  96. date_index2 = re.findall(date_index2_pattern, dates_section_index2)
  97. date_index2 = date_index2[0]
  98. # link / title / description
  99. link_title_desc_index2_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\" class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')
  100. link_title_desc_index2 = re.findall(link_title_desc_index2_pattern, dates_section_index2)
  101. link_title_desc_index2 = link_title_desc_index2[0]
  102.  
  103. # paragraf
  104. paragraf_index2_pattern = re.compile('<p class="mb-35px color-grey line-height-25px">(.*?)</p>')
  105. paragraf_index2 = re.findall(paragraf_index2_pattern, template_categorie)
  106. paragraf_index2 = paragraf_index2[0]
  107.  
  108. # === read more ===
  109. read_more_pattern = re.compile('<a href=\"(.*?)\" class="btn-setting color-black btn-hvr-up btn-blue btn-hvr-pink">read more</a>')
  110. read_more = re.findall(read_more_pattern, template_categorie)
  111. read_more = read_more[0]
  112.  
  113. butoane = list()
  114. if len(citeste_buton) > 0:
  115. butoane = citeste_buton
  116. else:
  117. butoane = read_more_buton
  118. print("CATEGORIE", len(categ_link_title))
  119. for i in range(len(categ_link_title)):
  120. new_template = template_categorie
  121. # === facem replace cu informatiile din articolul original ===
  122. new_template_1 = new_template.replace(date_index2, categ_date_link_title_desc[i][0].replace(', in', '').strip())
  123. new_template_2 = new_template_1.replace(link_title_desc_index2[0], categ_date_link_title_desc[i][1])
  124. new_template_3 = new_template_2.replace(link_title_desc_index2[1], categ_date_link_title_desc[i][2])
  125. new_template_4 = new_template_3.replace(link_title_desc_index2[2], categ_date_link_title_desc[i][3].lstrip())
  126. new_template_5 = new_template_4.replace(paragraf_index2, paragraf[i])
  127. new_template_6 = new_template_5.replace(read_more, butoane[i])
  128. new_template_7 = new_template_6.replace(h3[0], categ_link_title[i][0])
  129. new_template_8 = new_template_7.replace(h3[1], categ_link_title[i][1])
  130. final_text = final_text + new_template_8 + '\n'
  131.  
  132. text_html_model = text_html_model.replace(articol_categorie_index2, final_text)
  133. final_text = text_html_model
  134.  
  135. # schimbare CATEGORIES index2
  136. # preluare lista fisier html
  137. lista_pattern = re.compile('<ul id="sidebarNavigation">([\s\S]*?)</ul>')
  138. lista = re.findall(lista_pattern, text_html)
  139. if len(lista) != 0:
  140. lista = lista[0]
  141. elemente_lista_pattern = re.compile('<li><a href=\"(.*?)\" title=\"(.*?)\">(.*?) \((.*?)\)</a></li>')
  142. elemente_lista = re.findall(elemente_lista_pattern, lista)
  143. if elemente_lista != 0:
  144. categories_pattern = re.compile('<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
  145. categories = re.findall(categories_pattern, final_text)
  146. if len(categories) != 0:
  147. categories = categories[0]
  148. elemente_lista_model_pattern = re.compile('<div class="categories-name">([\s\S]*?)</div>')
  149. elemente_lista_model = re.findall(elemente_lista_model_pattern, categories)
  150. template_category = read_text_from_file('C:\\Folder1\\category-name.txt')
  151.  
  152. for i in range(len(elemente_lista_model)):
  153. new_template_category = template_category
  154. a_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\">')
  155. a = re.findall(a_pattern, new_template_category)[0]
  156. p_pattern = re.compile('<p class="font-16 color-grey text-capitalize"><i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) <span>(.*?)</span> </p>')
  157. p = re.findall(p_pattern, new_template_category)[0]
  158. new_template_category = new_template_category.replace(a[0], elemente_lista[i][0])
  159. new_template_category = new_template_category.replace(a[1], elemente_lista[i][1])
  160. new_template_category = new_template_category.replace(p[0], elemente_lista[i][2])
  161. new_template_category = new_template_category.replace(p[1], elemente_lista[i][3])
  162. # print(final_text)
  163. final_text = final_text.replace(elemente_lista_model[i], new_template_category)
  164.  
  165. else:
  166. print("No categories + books start")
  167. else:
  168. print("Niciun element <li>.")
  169. else:
  170. print("Tag <ul> gol.")
  171.  
  172.  
  173. # Shimbare LINK-URI FLAGS
  174. flags_pattern = re.compile('<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
  175. flags = re.findall(flags_pattern, text_html)
  176. if len(flags) != 0:
  177. flags = flags[0]
  178. links_pattern = re.compile('<a href=\"(.*?)\">')
  179. links = re.findall(links_pattern, flags)
  180. if len(links) != 0:
  181. # print("Links: ", links)
  182. flags_model = re.findall(flags_pattern, final_text)
  183. if len(flags_model) != 0:
  184. flags_model = flags_model[0]
  185. links_pattern_model = re.compile('<li><a cunt_code=\"\+\d+\" href=\"(.*?)\">')
  186. links_model = re.findall(links_pattern_model, flags_model)
  187. if len(links_model) != 0:
  188. for i in range(len(links)):
  189. # print(links[i], links_model[i])
  190. final_text = final_text.replace(links_model[i], links[i]) # FACE REPLACE
  191. else:
  192. print("Fara links in flags model")
  193. else:
  194. print("Fara links in flags model")
  195. else:
  196. print("Fara linkuri in flags.")
  197. else:
  198. print("Fara flags in articol original.")
  199.  
  200. # STARS - PHP
  201. stars_php_pattern = re.compile('\$item_id = (.*?);')
  202. stars_php = re.findall(stars_php_pattern, text_html)
  203. stars_php_model = re.findall(stars_php_pattern, final_text)
  204. if len(stars_php) != 0:
  205. stars_php = stars_php[0]
  206. if len(stars_php_model) != 0:
  207. stars_php_model = stars_php_model[0]
  208. final_text = final_text.replace(stars_php_model, stars_php) # FACE REPLACE
  209. else:
  210. print("No stars fisier model")
  211. else:
  212. print("No stars fisier original")
  213.  
  214. # TITLE
  215. title_pattern = re.compile('<title>(.*?)</title>')
  216. text_title = re.findall(title_pattern, text_html)
  217. text_title_model = re.findall(title_pattern, final_text)
  218. if len(text_title) != 0 and len(text_title_model) != 0:
  219. text_title = text_title[0]
  220. text_title_model = text_title_model[0]
  221. final_text = final_text.replace(text_title_model, text_title)
  222. else:
  223. print("Fisier html fara tag title: {}".format(cale_fisier_html))
  224.  
  225. # DESCRIPTION
  226. description_pattern = re.compile('<meta name="description" content="(.*?)">')
  227. text_description = re.findall(description_pattern, text_html)
  228. text_description_model = re.findall(description_pattern, final_text)
  229. if len(text_description) != 0 and len(text_description_model) != 0:
  230. text_description = text_description[0]
  231. text_description_model = text_description_model[0]
  232. final_text = final_text.replace(text_description_model, text_description)
  233. else:
  234. print("Fisier html fara tag description: {}".format(cale_fisier_html))
  235.  
  236. # CANONICAL
  237. canonical_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
  238. text_canonical = re.findall(canonical_pattern, text_html)
  239. text_canonical_model = re.findall(canonical_pattern, final_text)
  240. if len(text_canonical) != 0 and len(text_canonical_model) != 0:
  241. text_canonical = text_canonical[0]
  242. text_canonical_model = text_canonical_model[0]
  243. final_text = final_text.replace(text_canonical_model, text_canonical)
  244. else:
  245. print("Fisier html fara tag canonical: {}".format(cale_fisier_html))
  246.  
  247. # ULTIMELE ARTICOLE
  248. ult_art_pattern = re.compile('<!-- Ultimele articole -->([\s\S]*?)<!-- Ultimele articole final -->')
  249. ult_art_model_pattern = re.compile('<!-- Recent Post -->([\s\S]*?)<!-- Categories -->')
  250. ult_art = re.findall(ult_art_pattern, text_html)
  251. ult_art_model = re.findall(ult_art_model_pattern, final_text)
  252. if len(ult_art) != 0:
  253. ult_art = ult_art[0]
  254. if len(ult_art_model) != 0:
  255. ult_art_model = ult_art_model[0]
  256. articole_pattern = re.compile('<li><a href=\"(.*?)\">(.*?)</a></li>')
  257. articole = re.findall(articole_pattern, ult_art)
  258. if len(articole) != 0:
  259. articole_model_pattern = re.compile('<a href=\"(.*?)\" class="color-grey">(.*?)</a>')
  260. articole_model = re.findall(articole_model_pattern, ult_art_model)
  261. if len(articole_model) != 0:
  262. for i in range(len(articole)):
  263. # href - 0 / description - 1
  264. final_text = final_text.replace(articole_model[i][0], articole[i][0])
  265. final_text = final_text.replace(articole_model[i][1], articole[i][1])
  266. else:
  267. print("No articole fisier model")
  268. else:
  269. print("No articole fisier original")
  270. else:
  271. print("No lista articole fisier model")
  272. else:
  273. print("No lista articole fisier original")
  274.  
  275. else:
  276. print("Nu exista articol categorie in index2.html")
  277. else:
  278. # === citire fisier model - index.html ===
  279. text_html_model = read_text_from_file("C:\\Folder1\\index.html")
  280. # ARTICOL START - FINAL
  281. articol_pattern = re.compile('<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->[\s\S]*?')
  282. text_articol = re.findall(articol_pattern, text_html)
  283. text_articol_model = re.findall(articol_pattern, text_html_model)
  284. if len(text_articol) != 0 and len(text_articol_model) != 0:
  285. text_articol = text_articol[0]
  286. text_articol_model = text_articol_model[0]
  287. text_html_model_1 = text_html_model.replace(text_articol_model, text_articol)
  288. final_text = text_html_model_1
  289. else:
  290. print("Fisier html fara ARTICOL START/FINAL: {}".format(cale_fisier_html))
  291.  
  292. # TITLE
  293. title_pattern = re.compile('<title>(.*?)</title>')
  294. text_title = re.findall(title_pattern, text_html)
  295. text_title_model = re.findall(title_pattern, text_html_model_1)
  296. if len(text_title) != 0 and len(text_title_model) != 0:
  297. text_title = text_title[0]
  298. text_title_model = text_title_model[0]
  299. text_html_model_2 = text_html_model_1.replace(text_title_model, text_title)
  300. final_text = text_html_model_2
  301. else:
  302. print("Fisier html fara tag title: {}".format(cale_fisier_html))
  303.  
  304. # DESCRIPTION
  305. description_pattern = re.compile('<meta name="description" content="(.*?)">')
  306. text_description = re.findall(description_pattern, text_html)
  307. text_description_model = re.findall(description_pattern, text_html_model_2)
  308. if len(text_description) != 0 and len(text_description_model) != 0:
  309. text_description = text_description[0]
  310. text_description_model = text_description_model[0]
  311. text_html_model_3 = text_html_model_2.replace(text_description_model, text_description)
  312. final_text = text_html_model_3
  313. else:
  314. print("Fisier html fara tag description: {}".format(cale_fisier_html))
  315.  
  316. # CANONICAL
  317. canonical_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
  318. text_canonical = re.findall(canonical_pattern, text_html)
  319. text_canonical_model = re.findall(canonical_pattern, text_html_model_3)
  320. if len(text_canonical) != 0 and len(text_canonical_model) != 0:
  321. text_canonical = text_canonical[0]
  322. text_canonical_model = text_canonical_model[0]
  323. text_html_model_4 = text_html_model_3.replace(text_canonical_model, text_canonical)
  324. final_text = text_html_model_4
  325. else:
  326. print("Fisier html fara tag canonical: {}".format(cale_fisier_html))
  327.  
  328.  
  329. # remove DIV tag and TABLE tag
  330. text_articol_model = re.findall(articol_pattern, text_html_model_4)
  331. text_articol_model_old = text_articol_model[0]
  332. text_articol_model = text_articol_model[0]
  333. text_articol_model = text_articol_model.replace("<div align=\"justify\">", '')
  334. text_articol_model = text_articol_model.replace("</div>", '')
  335.  
  336. table_pattern = re.compile('<table[\s\S]*?</table>')
  337. text_table = re.findall(table_pattern, text_articol_model)
  338. if len(text_table) != 0:
  339. text_table = text_table[0]
  340. text_articol_model = text_articol_model.replace(text_table, '')
  341. text_html_model_5 = text_html_model_4.replace(text_articol_model_old, text_articol_model)
  342. final_text = text_html_model_5
  343. else:
  344. print("No text table")
  345.  
  346. # schimbare tag-uri ARTICLE TITLE
  347. article_title_pattern = re.compile('<h1 class="den_articol" itemprop="name">(.*?)</h1>')
  348. article_title = re.findall(article_title_pattern, text_articol_model_old)
  349. if len(article_title) != 0:
  350. article_title = article_title[0]
  351. h3_title_pattern = re.compile('<h3 class="font-weight-normal" itemprop="name"><a href="javascript:void\(0\)" class="color-black">(.*?)</a></h3>')
  352. h3_title = re.findall(h3_title_pattern, text_html_model_5)
  353. if len(h3_title) != 0:
  354. h3_title = h3_title[0]
  355. text_html_model_6 = text_html_model_5.replace(h3_title, article_title)
  356. final_text = text_html_model_6
  357. else:
  358. print("No h3 title.")
  359. else:
  360. print("No article title.")
  361.  
  362. # schimbare DATE
  363. date_pattern = re.compile('<td class="text_dreapta">(.*?), in <a')
  364. date = re.findall(date_pattern, text_articol_model_old)
  365. if len(date) != 0:
  366. date = date[0]
  367. # MODIFICARE 09/03
  368. date_section_pattern = re.compile('<!--STARTDATES-->([\s\S]*?)<!--FINNISHDATES-->')
  369. date_section = re.findall(date_section_pattern, text_html_model_6)
  370. if len(date_section) > 0:
  371. date_section = date_section[0]
  372. date_pattern_model = re.compile('<a href="javascript:void\(0\)" class="color-black">(.*?)</a>')
  373. date_model = re.findall(date_pattern_model, date_section)
  374. if len(date_model) != 0:
  375. date_model = date_model[0]
  376. text_html_model_7 = text_html_model_6.replace(date_model, date)
  377. final_text = text_html_model_7
  378. else:
  379. print('No date in model.')
  380. else:
  381. print("No date section: <!--STARTDATES--><!--FINNISHDATES-->")
  382. else:
  383. print("No date.")
  384.  
  385. # schimbare SECTION
  386. section_pattern_model = re.compile('<a href=\"(.*?)\" title=\"(.*?)\" class="color-green font-weight-600 mx-1" id="hidden">(.*?)</a>')
  387. section_model = re.findall(section_pattern_model, text_html_model_7)
  388. # print(section_model)
  389. if len(section_model) != 0:
  390. section_model = section_model[0]
  391. section_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\" class="external" rel="category tag">(.*?)</a>')
  392. section = re.findall(section_pattern, text_articol_model_old)
  393. if len(section) != 0:
  394. section = section[0]
  395. text_html_model_8 = text_html_model_7.replace(section_model[0], section[0])
  396. text_html_model_9 = text_html_model_8.replace(section_model[1], section[1])
  397. text_html_model_10 = text_html_model_9.replace(section_model[2], section[2])
  398. final_text = text_html_model_10
  399. else:
  400. print("No section.")
  401. else:
  402. print("No section model.")
  403.  
  404. # schimbare CATEGORIES
  405. # preluare lista fisier html
  406. lista_pattern = re.compile('<ul id="sidebarNavigation">([\s\S]*?)</ul>')
  407. lista = re.findall(lista_pattern, text_html)
  408. if len(lista) != 0:
  409. lista = lista[0]
  410. elemente_lista_pattern = re.compile('<li><a href=\"(.*?)\" title=\"(.*?)\">(.*?) \((.*?)\)</a></li>')
  411. elemente_lista = re.findall(elemente_lista_pattern, lista)
  412. if elemente_lista != 0:
  413. categories_pattern = re.compile('<!-- Categories -->([\s\S]*?)<!-- BOOKS START -->')
  414. categories = re.findall(categories_pattern, text_html_model_10)
  415. if len(categories) != 0:
  416. categories = categories[0]
  417. elemente_lista_model_pattern = re.compile('<div class="categories-name">([\s\S]*?)</div>')
  418. elemente_lista_model = re.findall(elemente_lista_model_pattern, categories)
  419. template_category = read_text_from_file('C:\\Folder1\\category-name.txt')
  420.  
  421. for i in range(len(elemente_lista_model)):
  422. new_template_category = template_category
  423. a_pattern = re.compile('<a href=\"(.*?)\" title=\"(.*?)\">')
  424. a = re.findall(a_pattern, new_template_category)[0]
  425. p_pattern = re.compile('<p class="font-16 color-grey text-capitalize"><i class="fa fa-angle-right font-14 color-blue mr-1"></i> (.*?) <span>(.*?)</span> </p>')
  426. p = re.findall(p_pattern, new_template_category)[0]
  427. new_template_category = new_template_category.replace(a[0], elemente_lista[i][0])
  428. new_template_category = new_template_category.replace(a[1], elemente_lista[i][1])
  429. new_template_category = new_template_category.replace(p[0], elemente_lista[i][2])
  430. new_template_category = new_template_category.replace(p[1], elemente_lista[i][3])
  431. # print(final_text)
  432. final_text = final_text.replace(elemente_lista_model[i], new_template_category)
  433. # print("==========================")
  434. # print(final_text)
  435. text_html_model_14 = final_text
  436. else:
  437. print("No categories + books start")
  438. else:
  439. print("Niciun element <li>.")
  440. else:
  441. print("Tag <ul> gol.")
  442.  
  443. # Shimbare LINK-URI FLAGS
  444. flags_pattern = re.compile('<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->')
  445. flags = re.findall(flags_pattern, text_html)
  446. if len(flags) != 0:
  447. flags = flags[0]
  448. links_pattern = re.compile('<a href=\"(.*?)\">')
  449. links = re.findall(links_pattern, flags)
  450. if len(links) != 0:
  451. # print("Links: ", links)
  452. flags_model = re.findall(flags_pattern, text_html_model_14)
  453. if len(flags_model) != 0:
  454. flags_model = flags_model[0]
  455. # print("Flags: ", flags_model)
  456. links_pattern_model = re.compile('<li><a cunt_code=\"\+\d+\" href=\"(.*?)\">')
  457. links_model = re.findall(links_pattern_model, flags_model)
  458. # print(links_model)
  459. text_html_model_15 = text_html_model_14
  460. if len(links_model) != 0:
  461. for i in range(len(links)):
  462. # print(links[i], links_model[i])
  463. text_html_model_15 = text_html_model_15.replace(links_model[i], links[i]) # FACE REPLACE
  464. final_text = text_html_model_15
  465. else:
  466. print("Fara links in flags model")
  467. else:
  468. print("Fara links in flags model")
  469. else:
  470. print("Fara linkuri in flags.")
  471. else:
  472. print("Fara flags in articol original.")
  473.  
  474. # STARS - PHP
  475. stars_php_pattern = re.compile('\$item_id = (.*?);')
  476. stars_php = re.findall(stars_php_pattern, text_html)
  477. stars_php_model = re.findall(stars_php_pattern, text_html_model_15)
  478. if len(stars_php) != 0:
  479. stars_php = stars_php[0]
  480. if len(stars_php_model) != 0:
  481. stars_php_model = stars_php_model[0]
  482. text_html_model_16 = text_html_model_15.replace(stars_php_model, stars_php) # FACE REPLACE
  483. final_text = text_html_model_16
  484. else:
  485. print("No stars fisier model")
  486. else:
  487. print("No stars fisier original")
  488.  
  489. # ULTIMELE ARTICOLE
  490. ult_art_pattern = re.compile('<!-- Ultimele articole -->([\s\S]*?)<!-- Ultimele articole final -->')
  491. ult_art_model_pattern = re.compile('<!-- Recent Post -->([\s\S]*?)<!-- Categories -->')
  492. ult_art = re.findall(ult_art_pattern, text_html)
  493. ult_art_model = re.findall(ult_art_model_pattern, text_html_model_16)
  494. if len(ult_art) != 0:
  495. ult_art = ult_art[0]
  496. if len(ult_art_model) != 0:
  497. ult_art_model = ult_art_model[0]
  498. articole_pattern = re.compile('<li><a href=\"(.*?)\">(.*?)</a></li>')
  499. articole = re.findall(articole_pattern, ult_art)
  500. if len(articole) != 0:
  501. articole_model_pattern = re.compile('<a href=\"(.*?)\" class="color-grey">(.*?)</a>')
  502. articole_model = re.findall(articole_model_pattern, ult_art_model)
  503. if len(articole_model) != 0:
  504. for i in range(len(articole)):
  505. # href - 0 / description - 1
  506. # MODIFICARE 09/03
  507. final_text = final_text.replace(articole_model[i][0], articole[i][0])
  508. final_text = final_text.replace(articole_model[i][1], articole[i][1])
  509. else:
  510. print("No articole fisier model")
  511. else:
  512. print("No articole fisier original")
  513. else:
  514. print("No lista articole fisier model")
  515. else:
  516. print("No lista articole fisier original")
  517.  
  518. file_path = cale_fisiere_gata + "\\" + os.path.basename(cale_fisier_html)
  519. write_to_file(final_text, file_path)
  520. print("Scriere efectuata cu succes.")
  521.  
  522.  
  523.  
  524. def creare_fisiere_html(cale_folder_html, cale_fisiere_gata):
  525. """
  526. Functia itereaza printr-un folder care contine fisiere txt si creeaza fisiere html corespunzatoare
  527. """
  528. count = 0
  529. for f in os.listdir(cale_folder_html):
  530. if f.endswith('.html'):
  531. cale_fisier_html = cale_folder_html + "\\" + f
  532. print("FISIER CURENT: ", cale_fisier_html)
  533. copiaza_continut_html(cale_fisier_html, cale_fisiere_gata)
  534. count += 1
  535. else:
  536. continue
  537. print("Numarul de fisiere modificate: ", count)
  538.  
  539. def main():
  540. creare_fisiere_html("c:\\Folder1\\fisiere_html", "C:\\Folder1\\fisiere_gata")
  541.  
  542. if __name__ == '__main__':
  543. main()
  544.  
# Advertisement
# Add Comment
# Please, Sign In to add comment
# Advertisement