Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Convert the raw TCP log text files found in ./oral_city into
# Snappy-compressed Parquet files under ./converted_tcp_logs.
#
# Each input line becomes one record: the first 25 characters of the line
# followed by every double-quoted field matched by `re_exp`. Records are
# accumulated across input files and written out in batches of `lenght_batch`
# rows as file0.parquet, file1.parquet, ...; the final, possibly smaller,
# batch is written as well.
import gc
import os
import re

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

INPUT_DIR = './oral_city'
OUTPUT_DIR = './converted_tcp_logs'

# Column names, assigned positionally: DATETIME receives the leading
# 25-character slice of the line, the remaining names receive the quoted
# fields in the order they occur on the line.
COLUMNS = [
    'DATETIME',
    'MSISDN',
    'IMSI',
    'IMEISV',
    'APN',
    'SGSN_ADDR',
    'RAT_TYPE',
    'ULI',
    'SRC',
    'DST',
    'PROTO',
    'XLATESRC',
    'MESSAGE_INFO',
    'SERVICE',
    'S_PORT',
    'XLATESPORT',
]

# Number of records written to a single Parquet file.
lenght_batch = 8000000

# A field is a double-quoted (possibly empty) run of ASCII letters, digits,
# dots and whitespace. Written as a raw string so `\s` reaches the regex
# engine instead of being treated as an invalid string escape.
re_exp = r'"([A-Za-z0-9.\s]*)"'
FIELD_PATTERN = re.compile(re_exp)


def write_batch(rows, batch_index):
    """Write one batch of parsed records to a Parquet file.

    Args:
        rows: list of records, each a list of field strings ordered like
            ``COLUMNS``.
        batch_index: integer used to build the output name
            ``file<batch_index>.parquet`` inside ``OUTPUT_DIR``.
    """
    dataframe = pd.DataFrame(rows, columns=COLUMNS)
    print(dataframe.head(3))
    table = pa.Table.from_pandas(dataframe)
    target = os.path.join(OUTPUT_DIR, 'file' + str(batch_index) + '.parquet')
    pq.write_table(table, target, compression='snappy')


list_files_txt = os.listdir(INPUT_DIR)
files_counter = 0
list_tcp_logs = []

# Fail early (or create the directory) instead of discovering a missing
# output directory only after a full batch has been parsed.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("ready")

# os.listdir returns names in arbitrary order; sorting keeps the contents of
# each output batch reproducible between runs.
for file_name in sorted(list_files_txt):
    print(file_name)
    # `with` guarantees the handle is closed even if parsing or writing fails.
    with open(os.path.join(INPUT_DIR, file_name), 'r') as log_file:
        for line in log_file:
            record = [line[:25], *FIELD_PATTERN.findall(line)]
            list_tcp_logs.append(record)
            if len(list_tcp_logs) == lenght_batch:
                write_batch(list_tcp_logs, files_counter)
                files_counter += 1
                # Drop the written rows before collecting the next batch so
                # two full batches are never held in memory at once.
                list_tcp_logs = []
                gc.collect()

# Write whatever is left over: without this the last partial batch (or the
# whole input, when it is shorter than one batch) would be silently lost.
if list_tcp_logs:
    write_batch(list_tcp_logs, files_counter)
    files_counter += 1
    list_tcp_logs = []
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement