Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import yaml
- from tkinter import messagebox, Label, Button, FALSE, Tk, Entry, Checkbutton, BooleanVar, StringVar
- import os
- from datetime import datetime
- from set_config import set_config
- from ftplib import FTP
- import pandas as pd
- from dateutil.parser import *
- from num2words import num2words
- import numpy as np
- #class Preprocessing
- # unit name extraction
class from_ftp:
    """Mirror dated files from an FTP directory into a local data directory.

    All settings are passed as keyword arguments and read with
    ``kwargs.get``, so any omitted setting is ``None``:

    * ``start_str`` -- prefix a remote file name must have to be considered.
    * ``path`` -- local working directory (created if missing);
      ``<path>/Config`` holds ``login.yaml`` and ``path.yaml``.
    * ``str_date_start`` -- earliest file date (inclusive), as a string
      accepted by ``dateutil.parser.parse``; ``None`` means no lower bound.
    * ``ftp`` -- FTP client object providing ``login``, ``sendcmd``,
      ``nlst`` and ``retrbinary``.
    * ``remote_path`` -- remote directory to change into before listing.
    * ``data_path`` -- local download directory; when ``None`` it is read
      from the ``File`` key of ``path.yaml``.
    """

    # Class-level defaults; every instance overwrites them in __init__.
    path = ""
    login_path = ""
    remote_path = ""
    path_path = ""

    def __init__(self, **kwargs):
        """Store the settings and make sure the working/config dirs exist."""
        self.start_str = kwargs.get('start_str')
        self.path = kwargs.get('path')
        self.str_date_start = kwargs.get('str_date_start')
        self.ftp = kwargs.get("ftp")
        self.remote_path = kwargs.get("remote_path")
        self.data_path = kwargs.get('data_path')

        if not os.path.exists(self.path):
            os.makedirs(self.path)

        config_path = self.path + "/Config"
        if not os.path.exists(config_path):
            # First run: let set_config create the configuration directory.
            config = set_config(config_path)
            config.setup()
        self.login_path = config_path + "/login.yaml"
        self.path_path = config_path + "/path.yaml"

    def get_file_date(self, name):
        """Return the date encoded in the last three ``_``-separated fields.

        The last three fields of ``name`` (extension removed from the final
        one) are joined with ``-`` and handed to ``dateutil.parser.parse``.

        Raises:
            IndexError: ``name`` has fewer than three ``_``-separated fields.
            ValueError: the joined fields are not a parsable date.
        """
        fields = name.split("_")
        # splitext instead of a fixed [:-4] so extensions of any length work.
        last_field = os.path.splitext(fields[-1])[0]
        stamp = parse(fields[-3] + "-" + fields[-2] + "-" + last_field)
        return stamp.date()

    def filter_remote_files(self):
        """List remote files that match the prefix and the date window.

        A file is kept when its name starts with ``start_str`` and its date
        is strictly before today and not before ``str_date_start`` (no lower
        bound when ``str_date_start`` is ``None``). Files whose name carries
        no parsable date are reported and skipped instead of aborting the
        whole listing.
        """
        today = datetime.now().date()
        date_start = None
        if self.str_date_start is not None:
            date_start = parse(self.str_date_start).date()

        relevant_files = []
        for name in self.ftp.nlst():
            if not name.startswith(self.start_str):
                continue
            try:
                file_date = self.get_file_date(name)
            except (IndexError, ValueError, OverflowError):
                print('Skipping (no parsable date): ' + name)
                continue
            if file_date >= today:
                # Today's file is skipped (see the check above).
                continue
            if date_start is not None and file_date < date_start:
                continue
            relevant_files.append(name)
        return relevant_files

    def retrieve_files(self):
        """Download every relevant remote file that is not present locally.

        A download that raises is cleaned up (the partial local file is
        removed) before the exception is re-raised; otherwise the truncated
        file would be treated as already downloaded on the next run.
        """
        if self.data_path is None:
            with open(self.path_path, 'r') as stream:
                data_dict = yaml.safe_load(stream)
            data_path = data_dict.get('File')
        else:
            data_path = self.data_path

        print('BUILDING LOCAL DIR FILE LIST...')
        local_files = set(os.listdir(data_path))

        self.ftp.sendcmd('CWD ' + self.remote_path)
        print('BUILDING REMOTE DIR FILE LIST...\n')
        missing = sorted(set(self.filter_remote_files()) - local_files)

        for name in missing:
            print('File: ' + name)
            target = os.path.join(data_path, name)
            try:
                with open(target, 'wb') as ftpfile:
                    status = self.ftp.retrbinary('RETR ' + name, ftpfile.write)
            except Exception:
                if os.path.exists(target):
                    os.remove(target)
                raise
            if str(status).startswith('226'):  # '226 Transfer complete.'
                print('\nOK\n')
            else:
                print(status)  # unexpected reply: show it verbatim

    def try_login(self, username, password):
        """Log in on ``self.ftp``; return ``True`` on success, else ``False``.

        On failure a warning dialog is shown and the error is not re-raised.
        """
        print("Trying to login...")
        print(username)
        try:
            self.ftp.login(username, password)
        except Exception:
            messagebox.showinfo("-- ERROR --", "Please enter valid infomation!", icon="warning")
            return False
        return True

    def ftp_sync(self):
        """Read credentials from ``login.yaml``, log in, then fetch files.

        Files are only retrieved when the login succeeded.
        """
        with open(self.login_path, 'r') as stream:
            login_info = yaml.safe_load(stream)
        username = login_info.get("Username")
        password = login_info.get("Password")
        print(username)
        if self.try_login(username, password):
            self.retrieve_files()
class Preprocessing_Base:
    """Clean raw CSV logs and write the processed frames to a result dir.

    All settings are passed as keyword arguments and read with
    ``kwargs.get``, so any omitted setting is ``None``:

    * ``path`` -- base directory; ``main`` prepends it to ``result_dir``.
    * ``from_path`` -- when ``True`` the unit name comes from the data
      directory name instead of the file name.
    * ``alarm_name`` -- column that is converted to strings.
    * ``del_cols`` -- columns removed right after reading a file.
    * ``intended_cols`` -- column names assigned to a single-unit file.
    * ``cols_to_fix`` -- columns passed through ``prepare_col``.
    * ``deviation_cols`` -- columns replaced by their absolute value.
    * ``binary_cols`` -- ``0``/``1`` columns replaced by labels.
    * ``one_strs`` / ``zero_strs`` -- label look-ups for ``binary_cols``.
    * ``temp_cols`` -- mapping with ``"in"`` and ``"out"`` column names used
      to compute ``Delta_T``.
    """

    def __init__(self, **kwargs):
        """Store the configuration; nothing is validated here."""
        self.path = kwargs.get('path')
        self.from_path = kwargs.get('from_path')
        self.alarm_name = kwargs.get('alarm_name')
        self.del_cols = kwargs.get('del_cols')
        self.intended_cols = kwargs.get('intended_cols')
        self.cols_to_fix = kwargs.get('cols_to_fix')
        self.deviation_cols = kwargs.get('deviation_cols')
        self.one_strs = kwargs.get("one_strs")
        self.temp_cols = kwargs.get("temp_cols")
        self.zero_strs = kwargs.get("zero_strs")
        self.binary_cols = kwargs.get("binary_cols")

    def unit_name(self, cols, file_n, df):
        """Tag ``df`` with the unit name taken from the file name.

        The unit name is the part of ``file_n`` before the first ``_``; its
        upper-cased form is written to ``df["Unit_Name"]``. When ``cols`` has
        a different length than ``intended_cols`` the surplus columns are
        first dropped from ``df`` in place (previously the drop only affected
        a local copy, so the caller's frame was left untouched).

        Returns:
            The unit name exactly as it appears in the file name.
        """
        keep = len(self.intended_cols)
        if len(cols) != keep:
            df.drop(columns=df.columns[keep:], inplace=True)
        unitName = file_n.split("_")[0]
        df["Unit_Name"] = unitName.upper()
        return unitName

    def df_to_string(self, column, df):
        """Convert ``df[column]`` to strings in place."""
        df[column] = df[column].astype(str)

    def binary_to_string(self, binary_col, df):
        """Replace ``'0'``/``'1'`` in ``binary_col`` by their labels.

        The column is first converted to strings. Labels are looked up in
        ``zero_strs``/``one_strs`` under the column name, minus a trailing
        ``_<char>`` suffix if there is one. If any value is neither ``'0'``
        nor ``'1'`` the column is left as the string-converted original.
        A column that is not in ``df`` is ignored.
        """
        if binary_col not in df.columns:
            return
        self.df_to_string(binary_col, df)

        has_suffix = len(binary_col) >= 2 and binary_col[-2] == '_'
        key_col = binary_col[:-2] if has_suffix else binary_col

        values = df[binary_col].tolist()
        if any(val not in ('0', '1') for val in values):
            return
        # Assign a plain list: it is positional, so a non-default index
        # (e.g. after rows were dropped) cannot misalign the labels.
        df[binary_col] = [
            self.one_strs.get(key_col) if val == '1' else self.zero_strs.get(key_col)
            for val in values
        ]

    def binary_col_array(self, binary_cols, df):
        """Apply ``binary_to_string`` to every column in ``binary_cols``."""
        for binary_col in binary_cols:
            self.binary_to_string(binary_col, df)

    def delete_cols(self, df, del_cols):
        """Drop, in place, every column of ``del_cols`` present in ``df``."""
        for thisCol in del_cols:
            if thisCol in df.columns:
                df.drop(thisCol, axis=1, inplace=True)

    def take_abs_of_devs(self, df):
        """Replace every ``deviation_cols`` column by its absolute value."""
        for dev_col in self.deviation_cols:
            # ndarray assignment is positional -> safe for any index.
            df[dev_col] = np.absolute(df[dev_col].values)

    def create_file(self, result_dir, fileN, df):
        """Write ``df`` to ``<result_dir>/<fileN minus 4 chars>_.csv``.

        Empty frames are not written.
        """
        if df.shape[0] > 0:
            target = os.path.join(result_dir, fileN[:-4] + "_.csv")
            df.to_csv(target)

    def prepare_col(self, name, zero, df):
        """Turn column ``name`` into clipped row-to-row differences.

        The column is replaced by ``zero`` followed by the differences of
        consecutive values, clipped to the range 0..1. Integer columns are
        used directly; columns whose first value is a digit string are
        converted with ``int`` first. Missing, empty and other columns are
        left untouched.

        Raises:
            ValueError: a later value of a digit-string column is not an
                integer literal.
        """
        if name not in df.columns or df.shape[0] == 0:
            return
        series = df[name]
        if pd.api.types.is_integer_dtype(series.dtype):
            counts = np.asarray(series, dtype=np.int64)
        else:
            first = series.values[0]
            if not (isinstance(first, str) and first.isdigit()):
                return
            counts = [int(x) for x in series.values]
        steps = np.append(zero, np.ediff1d(counts)).clip(min=0, max=1)
        # Positional assignment; a DataFrame here would align on a fresh
        # 0..n-1 index and scramble frames whose rows were dropped.
        df[name] = steps

    def prepare_arr_of_cols(self, zero, cols_to_fix, df):
        """Apply ``prepare_col`` to each column; ``None`` means no columns."""
        for col in cols_to_fix or ():
            self.prepare_col(col, zero, df)

    def find_different(self, data_path, result_dir):
        """Return the sorted ``.csv`` files in ``data_path`` not yet processed.

        A result file ``<stem>_.csv`` or ``<stem>_<digits>.csv`` marks
        ``<stem>.csv`` as processed; any other entry of ``result_dir`` is
        ignored.
        """
        already_processed = set()
        with os.scandir(result_dir) as listOfEntries:
            for entry in listOfEntries:
                stem, ext = os.path.splitext(entry.name)
                if ext != ".csv":
                    continue
                stem = stem.rstrip("0123456789")  # unit number, if any
                if stem.endswith("_"):
                    already_processed.add(stem[:-1] + ".csv")

        file_names = set()
        with os.scandir(data_path) as listOfEntries:
            for entry in listOfEntries:
                if entry.is_file() and entry.name[-4:] == ".csv":
                    file_names.add(entry.name)
        return sorted(file_names - already_processed)

    def delta_t(self, temp_cols, df):
        """Return ``out - in`` temperature; both columns become float."""
        df[temp_cols["out"]] = df[temp_cols["out"]].astype(float)
        df[temp_cols["in"]] = df[temp_cols["in"]].astype(float)
        return df[temp_cols["out"]] - df[temp_cols["in"]]

    def unit_name_multiple(self, data_path, df, i):
        """Tag ``df`` for unit ``i`` of a multi-unit file.

        ``Location`` is the upper-cased last component of ``data_path``;
        ``Unit_Name`` is its first space-separated word followed by ``i``
        spelled out via ``num2words``, upper-cased; ``Category`` is
        ``"Field"``.
        """
        location = os.path.basename(os.path.normpath(data_path))
        name = location.split(" ")[0] + num2words(i)
        df["Unit_Name"] = name.upper()
        df["Location"] = location.upper()
        df["Category"] = "Field"

    def unit_name_from_path(self, data_path, df):
        """Tag ``df`` with the first word of the data directory name.

        Also adds ``Delta_T`` when ``temp_cols`` is configured.
        """
        name = os.path.basename(os.path.normpath(data_path))
        name = name.split(" ")[0]
        df["Unit_Name"] = name.upper()
        df["Location"] = name.upper()
        df["Category"] = "Field"
        if self.temp_cols is not None:
            df["Delta_T"] = self.delta_t(self.temp_cols, df)

    def del_row_with_dashes(self, df):
        """Drop, in place, rows holding ``'---'`` in any column after the first two."""
        for col in df.columns[2:]:
            df.drop(df.loc[df[col] == '---'].index, inplace=True)

    def create_multiple_file(self, df, result_dir, fileN, i, intended_cols_i, data_path):
        """Extract unit ``i`` from a multi-unit frame and write it to CSV.

        The columns ``intended_cols_i`` are copied, renamed to
        ``intended_cols[:-1]``, cleaned and written to
        ``<result_dir>/<fileN minus 4 chars>_<i>.csv``. Processing is
        best-effort: a failure is reported and the unit is skipped, so the
        remaining units and files are still handled.
        """
        try:
            split_df = df[intended_cols_i].copy()
            split_df.columns = self.intended_cols[:-1]
            self.del_row_with_dashes(split_df)
            self.prepare_arr_of_cols(np.array([0]), self.cols_to_fix, split_df)
            if self.temp_cols is not None:
                split_df["Delta_T"] = self.delta_t(self.temp_cols, split_df)
            self.unit_name_multiple(data_path, split_df, i)
            if self.binary_cols is not None:
                self.binary_col_array(self.binary_cols, split_df)
            target = os.path.join(result_dir, fileN[:-4] + "_" + str(i) + ".csv")
            split_df.to_csv(target)
        except Exception as e:
            print("Skipping unit " + str(i) + " of " + fileN + ": " + repr(e))

    def format_multiple_cols(self, df, fileN, result_dir, data_path, i, last_int):
        """Write one file per unit, for units ``i`` .. ``int(last_int)``.

        Unit ``i`` is always processed, even when ``last_int`` is smaller.
        The per-unit columns are ``Time``, ``Date`` and every name in
        ``intended_cols[2:-1]`` suffixed with ``_<unit>``.
        """
        last_unit = max(i, int(last_int))
        for unit in range(i, last_unit + 1):
            intended_cols_i = ['Time', 'Date'] + [
                col + "_" + str(unit) for col in self.intended_cols[2:-1]
            ]
            print(len(intended_cols_i))
            self.unit_name_multiple(data_path, df, unit)
            self.create_multiple_file(df, result_dir, fileN, unit, intended_cols_i, data_path)

    def format_cols(self, cols, df, fileN, result_dir, data_path):
        """Clean a single-unit frame and write it via ``create_file``.

        Frames with at most one row are reported and skipped. ``cols`` is
        accepted for interface compatibility; the frame's own columns are
        used.
        """
        if self.alarm_name is not None:
            self.df_to_string(self.alarm_name, df)

        if df.shape[0] <= 1:
            print("\n\n\n*******\n", fileN, " = not considered in analysis\n because it has only 1 line of data")
            return

        cols = df.columns  # snapshot taken before the tagging columns are added
        if self.from_path == True:
            self.unit_name_from_path(data_path, df)
        else:
            self.unit_name(df.columns, fileN, df)

        print(fileN)
        self.prepare_arr_of_cols(np.array([0]), self.cols_to_fix, df)
        if self.deviation_cols is not None:
            self.take_abs_of_devs(df)
        if self.binary_cols is not None:
            self.binary_col_array(self.binary_cols, df)
        print(fileN, len(cols), cols[-1], cols[0])
        self.create_file(result_dir, fileN, df)

    def check_if_multiple(self, df, last_int):
        """Return ``True`` when ``last_int`` is a digit string greater than 1.

        ``df`` is unused and only kept for interface compatibility.
        """
        return last_int.isdigit() and int(last_int) > 1

    def read_files(self, data_path, result_dir):
        """Process every not-yet-processed CSV file of ``data_path``.

        A file with exactly ``len(intended_cols)`` columns (after removing
        ``del_cols``) is treated as single-unit; otherwise it is split per
        unit when ``check_if_multiple`` accepts its unit count. Unreadable
        files are reported and skipped.
        """
        min_cols = len(self.intended_cols)
        for fileN in self.find_different(data_path, result_dir):
            try:
                df = pd.read_csv(os.path.join(data_path, fileN))
            except Exception as e:
                print("Skipping " + fileN + ": " + repr(e))
                continue
            if self.del_cols is not None:
                self.delete_cols(df, self.del_cols)

            cols = df.columns
            if len(cols) == min_cols:
                df.columns = self.intended_cols
                self.format_cols(cols, df, fileN, result_dir, data_path)
                continue

            colnames = cols.values.tolist()
            # NOTE(review): the unit count is read from the last character of
            # the 12th-from-last column name; presumably the trailing columns
            # are not per-unit -- confirm against the data layout.
            if len(colnames) < 12:
                continue
            last_int = str(colnames[-12])[-1:]
            if self.check_if_multiple(df, last_int):
                self.format_multiple_cols(df, fileN, result_dir, data_path, 1, last_int)

    def main(self, data_path, result_dir):
        """Create ``self.path + result_dir`` if needed and process ``data_path``."""
        result_dir = self.path + result_dir
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        self.read_files(data_path, result_dir)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement