Advertisement
Guest User

Untitled

a guest
Apr 5th, 2020
139
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.56 KB | None | 0 0
  1. #get files to list_txt
  2. import os
  3. import numpy as np
  4. import pandas as pd
  5. import pyarrow.parquet as pq
  6. import pyarrow as pa
  7. import gc
  8. import re
  9.  
  10. list_files_txt = os.listdir('./oral_city')
  11.  
  12. COLUMNS = ['DATETIME',
  13.            'MSISDN',
  14.            'IMSI',
  15.            'IMEISV',
  16.            'APN',
  17.            'SGSN_ADDR',
  18.            'RAT_TYPE',
  19.            'ULI',
  20.            'SRC',
  21.            'DST',
  22.            'PROTO',
  23.            'XLATESRC',
  24.            'MESSAGE_INFO',
  25.            'SERVICE',
  26.            'S_PORT',
  27.            'XLATESPORT']
  28.  
  29. files_counter = 0
  30. lenght_batch = 8000000
  31. list_tcp_logs = []
  32. line_counter = 0
  33. re_exp = '"([A-Za-z0-9.\s]*)\"'
  34.  
  35.  
  36. print("ready")
  37.  
  38. for file_txt in list_files_txt:
  39.     print(file_txt)
  40.     file_txt = open("./oral_city/"+file_txt, 'r')
  41.     for line in file_txt:
  42.         line_counter+=1
  43.         list_info_on_request = re.findall(re_exp, line)
  44.         list_info_on_request.insert(0, line[:25])
  45.         list_tcp_logs.append(np.array(list_info_on_request))
  46.         if line_counter == lenght_batch:
  47.             line_counter = 0
  48.             dataframe = pd.DataFrame(list_tcp_logs)
  49.             print(dataframe.head(3))
  50.             dataframe.columns = COLUMNS
  51.             table = pa.Table.from_pandas(dataframe)
  52.             pq.write_table(table, './converted_tcp_logs/file' + str(files_counter) + '.parquet', compression='snappy')
  53.             list_tcp_logs = []
  54.             dataframe = None
  55.             table = None
  56.             gc.collect()
  57.             files_counter+=1
  58.         else:
  59.             continue
  60.     file_txt.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement