Advertisement
Guest User

Untitled

a guest
Feb 27th, 2025
41
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.33 KB | None | 0 0
  1. import io
  2. import os
  3. from typing import Dict
  4. import pandas as pd
  5. from pypdf import PdfReader
  6. import pypdfium2 as pdfium # LICENSE OK
  7. from PIL import Image
  8. from tqdm import tqdm
  9.  
  # Folder and file locations used by the script.
  docs_folder = "docs"
  output_images_folder = "output_images"
  os.makedirs(output_images_folder, exist_ok=True)

  # Load both CSV tables.
  docs_df = pd.read_csv("document_base.csv")
  invoices_df = pd.read_csv("invoice_base.csv")

  # Plan (translated from the original notes):
  #   * keep only the docs_df rows whose `invoices_result` holds exactly one entry;
  #   * turn every page of each PDF into an image;
  #   * store them under images/, one folder per invoice, and pass on the
  #     path of each saved folder.


  def _has_single_invoice(raw_result):
      """Tell whether the evaluated `invoices_result` cell has exactly one item."""
      # NOTE(review): eval() executes whatever expression the CSV cell holds.
      # Only run this on a trusted document_base.csv; if the cells are plain
      # literals, ast.literal_eval would be the safe replacement.
      return len(eval(raw_result).items()) == 1


  print('before docs_df:', docs_df.shape)
  single_invoice_rows = docs_df['invoices_result'].apply(_has_single_invoice)
  docs_df = docs_df[single_invoice_rows]
  print('after docs_df:', docs_df.shape)
  26.  
  def _normalize_eof_marker(pdf_bytes: bytes) -> bytes:
      """Return *pdf_bytes* rewritten so that it ends with a single ``%%EOF``."""
      eof_marker = b'%%EOF'
      if eof_marker in pdf_bytes:
          # Remove every (possibly early) marker and put one at the very end.
          return pdf_bytes.replace(eof_marker, b'') + eof_marker

      # No complete marker: look for a truncated one ("%%", "%%EO", ...) in
      # the last few bytes of the file.
      last_percent_index = pdf_bytes.rfind(b'%%')
      marker_is_truncated = (
          last_percent_index != -1  # rfind() returns -1 when "%%" is absent
          and len(pdf_bytes) - last_percent_index < 10
      )
      if marker_is_truncated:
          # Keep everything *before* the partial marker. The previous slice,
          # pdf_bytes[:-last_percent_index], kept only the first few bytes
          # of the file and threw the document away.
          return pdf_bytes[:last_percent_index] + eof_marker

      # Some files really don't have an EOF marker at all.
      return pdf_bytes + eof_marker


  def _fix_and_read_pdf(
      pdf_bytes: bytes
  ) -> "PdfReader":
      """
      Attempt to fix PDF EOF marker issues and read the content.

      Parameters
      ----------
      pdf_bytes : bytes
          Bytes object of the PDF file.

      Returns
      -------
      PdfReader
          PdfReader built on an in-memory stream of the repaired bytes.
      """
      return PdfReader(io.BytesIO(_normalize_eof_marker(pdf_bytes)))
  57.  
  def convert_pdf_to_images(
      pdf_bytes: bytes,
      scale: float = 0.5,
  ) -> "Dict[str, Image.Image]":
      """
      Convert a PDF document to images.

      Parameters
      ----------
      pdf_bytes : bytes
          Bytes object of the PDF file.
      scale : float, optional
          Render scale handed to ``page.render``. According to the pypdfium2
          documentation a scale of 1 is one pixel per PDF canvas unit
          (usually 1/72 in), so the default of 0.5 is roughly 36 DPI.

      Returns
      -------
      Dict[str, Image.Image]
          Images keyed ``"PAGE_1"``, ``"PAGE_2"``, ... in page order. An empty
          dictionary is returned when the document cannot be opened or
          rendered; the error is printed, not raised.
      """
      pdf_document = None
      try:
          # Convert pdf_bytes to a PdfDocument object
          pdf_document = pdfium.PdfDocument(pdf_bytes)

          # Render each page to a bitmap and convert it to a PIL image
          return {
              f"PAGE_{page_number}": page.render(scale=scale).to_pil()
              for page_number, page in enumerate(pdf_document, start=1)
          }

      except Exception as e:
          # Best-effort: report what went wrong and hand back "no pages"
          # instead of an implicit None that breaks callers using .values().
          print(f'Erro no pdf: {e}')
          return {}

      finally:
          # Compare against None explicitly so that an opened document is
          # closed regardless of how the object evaluates in a boolean test.
          if pdf_document is not None:
              pdf_document.close()
  95.  
  96. import io
  97.  
  # Split the documents into PDFs and plain images. The extension test is
  # case-insensitive so that "X.PDF" is not sent to the image loader.
  is_pdf = docs_df['filename'].str.lower().str.endswith('.pdf')
  pdf_df = docs_df[is_pdf]
  image_df = docs_df[~is_pdf]

  # Maps "<file name without extension>_Fatura_1" -> {'images': [PIL images]}.
  images = {}

  for file_path in pdf_df['filename']:
      try:
          with open(os.path.join(docs_folder, file_path), 'rb') as f:
              original_pdf_bytes = f.read()

          # Repair the EOF marker, then take the repaired bytes back out of
          # the reader's in-memory stream for rendering.
          fixed_pdf_reader = _fix_and_read_pdf(original_pdf_bytes)
          fixed_pdf_bytes = fixed_pdf_reader.stream.getvalue()

          file_images = convert_pdf_to_images(fixed_pdf_bytes)
          if not file_images:
              # Covers both a failed conversion and a PDF without pages.
              print(f"Error processing '{file_path}': no pages could be rendered")
              continue

          # splitext() drops only the real extension, whatever its case.
          invoice_key = f'{os.path.splitext(file_path)[0]}_Fatura_1'
          images[invoice_key] = {'images': list(file_images.values())}

      except Exception as e:
          print(f"Error processing '{file_path}': {e}")

  for file_path in image_df['filename']:
      print(f"Processing '{file_path}'...")

      try:
          file_image = Image.open(os.path.join(docs_folder, file_path))
          # Image.open() is lazy: load() reads the pixels now, so a corrupt
          # file fails here, and for single-frame images Pillow closes the
          # underlying file afterwards.
          file_image.load()
      except Exception as e:
          # Same policy as the PDF loop: report and move on to the next file.
          print(f"Error processing '{file_path}': {e}")
          continue

      # Strip whatever extension the image has (.JPG, .jpg, .png, ...) so the
      # key follows the same pattern as the one built for PDFs.
      invoice_key = f'{os.path.splitext(file_path)[0]}_Fatura_1'
      images[invoice_key] = {'images': [file_image]}

      print(f"Image '{file_path}' loaded")
  131.  
  # Attach to every loaded invoice the remaining columns of its row in
  # invoices_df, stored under the 'result' key. Rows whose invoice_name has
  # no entry in `images` are ignored.
  for _, invoice_row in invoices_df.iterrows():
      entry = images.get(invoice_row['invoice_name'])
      if entry is None:
          continue
      entry['result'] = invoice_row.drop('invoice_name').to_dict()

  # print(images)
  139.  
  import json

  # One {'messages': [...]} conversation per invoice, consumed by the trainer.
  finetune_data = []

  # NOTE(review): this prompt speaks of the *text* of an invoice although the
  # samples carry images; it is left untouched because it is training input.
  system_instruction = "You are an AI assistant for invoice fields extraction. You will receive the text of an invoice. You will receive a list of fields to fill in. Think step by step and extract the requested fields, always in text, one by one, with attention to detail"
  user_instruction = (
      "You are an image processing language model specialized in extracting invoice details from images. Given an image of an invoice, extract the following fields:\n"
      "\n"
      "invoice_number: The invoice number.\n"
      "billing_date: The billing date.\n"
      "supplier_vat: The supplier's VAT number.\n"
      "supplier_country: The supplier's country.\n"
      "supplier_name: The supplier's name.\n"
      "customer_vat: The customer's VAT number.\n"
      "customer_country: The customer's country.\n"
      "customer_address_street: The customer's address street.\n"
      "customer_address_zip_code: The customer's zip code.\n"
      "customer_address_city: The customer's city.\n"
      "total_invoice_without_taxes: The total amount of the invoice without taxes.\n"
      "total_invoice_with_taxes: The total amount of the invoice with taxes.\n"
      "invoice_currency: The currency of the invoice.\n"
      "invoice_description: A description of the invoice.\n"
      "invoice_type: The type of the invoice.\n"
      "observation: Any additional observations.\n"
      "If any field is not present in the image, set its value to null.\n"
      "\n"
      "Return the extracted data as a JSON object with the keys exactly as listed above.\n"
  )

  for invoice_name, data in images.items():
      invoice_images = data['images']
      if not invoice_images:
          print(f"Skipping '{invoice_name}' as it has no images")
          continue

      # 'result' only exists when invoices_df had a row for this invoice;
      # without it there is no target to learn from, so skip instead of
      # failing with a KeyError.
      invoice_data = data.get('result')
      if invoice_data is None:
          print(f"Skipping '{invoice_name}' as it has no result data")
          continue

      # The user prompt asks for a JSON object with null for absent fields, so
      # the target is serialized that way: empty CSV cells (NaN) become null.
      # default=str is a deliberate fallback that writes any value json cannot
      # encode natively as text.
      target_fields = {
          field: (None if pd.isna(value) else value)
          for field, value in invoice_data.items()
      }
      target_text = json.dumps(target_fields, ensure_ascii=False, default=str)

      image_parts = [{"type": "image", "image": image} for image in invoice_images]

      sample_instruction = [
          {
              "role": "system",
              "content": [{"type": "text", "text": system_instruction}],
          },
          {
              # All page images first, then the extraction instructions.
              "role": "user",
              "content": image_parts + [{"type": "text", "text": user_instruction}],
          },
          {
              "role": "assistant",
              "content": [{"type": "text", "text": target_text}],
          },
      ]

      finetune_data.append({'messages': sample_instruction})
  199.  
  from unsloth import FastVisionModel  # FastLanguageModel is the one for LLMs

  # Load the 4-bit Qwen2.5-VL checkpoint together with its tokenizer.
  model, tokenizer = FastVisionModel.from_pretrained(
      "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
      # Original note: 4bit reduces memory use; False would mean 16bit LoRA.
      load_in_4bit=True,
      # Original note: True or "unsloth" are the values meant for long context.
      use_gradient_checkpointing=False,
  )

  # Which parts of the network receive LoRA adapters.
  lora_targets = dict(
      finetune_vision_layers=False,     # vision layers are not finetuned
      finetune_language_layers=True,    # language layers are finetuned
      finetune_attention_modules=True,  # attention layers are finetuned
      finetune_mlp_modules=True,        # MLP layers are finetuned
  )

  # LoRA hyper-parameters (remarks carried over from the original comments).
  lora_hyperparameters = dict(
      r=16,               # larger means higher accuracy, but might overfit
      lora_alpha=16,      # recommended to be at least equal to r
      lora_dropout=0,
      bias="none",
      random_state=3407,
      use_rslora=False,   # rank stabilized LoRA is available but not used
      loftq_config=None,  # LoftQ is available but not used
      # target_modules is optional; a list can be given if needed
  )

  model = FastVisionModel.get_peft_model(
      model,
      **lora_targets,
      **lora_hyperparameters,
  )
  from unsloth import is_bf16_supported
  from unsloth.trainer import UnslothVisionDataCollator
  from trl import SFTTrainer, SFTConfig

  # Put the model into training mode before building the trainer.
  FastVisionModel.for_training(model)

  # The original marks this collator as mandatory ("Must use!").
  vision_collator = UnslothVisionDataCollator(model, tokenizer)

  # Train in bf16 when the hardware supports it, otherwise in fp16.
  use_bf16 = is_bf16_supported()

  training_args = SFTConfig(
      per_device_train_batch_size=1,
      gradient_accumulation_steps=1,
      warmup_steps=5,
      max_steps=15,
      # For a full training run, set num_train_epochs=1 instead of max_steps.
      learning_rate=2e-4,
      fp16=not use_bf16,
      bf16=use_bf16,
      logging_steps=1,
      optim="adamw_8bit",
      weight_decay=0.01,
      lr_scheduler_type="linear",
      seed=3407,
      output_dir="outputs",
      report_to="none",  # no reporting (e.g. to Weights and Biases)

      # The original flags the following items as required for vision
      # finetuning:
      remove_unused_columns=False,
      dataset_text_field="",
      dataset_kwargs={"skip_prepare_dataset": True},
      dataset_num_proc=4,
      max_seq_length=2048,
  )

  trainer = SFTTrainer(
      model=model,
      tokenizer=tokenizer,
      data_collator=vision_collator,
      train_dataset=finetune_data,
      args=training_args,
  )

  trainer_stats = trainer.train()

  # Save the trained model and the tokenizer locally.
  model.save_pretrained("lora_model")
  tokenizer.save_pretrained("lora_model")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement