SHARE
TWEET

Untitled

a guest Sep 15th, 2019 155 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import yaml
  2. from tkinter import messagebox, Label, Button, FALSE, Tk, Entry, Checkbutton, BooleanVar, StringVar
  3. import os
  4. from datetime import datetime
  5. from set_config import set_config
  6. from ftplib import FTP
  7. import pandas as pd
  8. from dateutil.parser import *
  9. from num2words import num2words
  10. import numpy as np
  11. #class Preprocessing
  12.     # unit name extraction
  13. class from_ftp:
  14. #retreive files from ftp server, with login, adress, and path from yaml file
  15.     path = ""
  16.     login_path = ""
  17.     remote_path = ""
  18.     path_path = ""
  19.     def __init__(self, **kwargs):
  20.         self.start_str = kwargs.get('start_str')
  21.         self.path = kwargs.get('path')
  22.         self.str_date_start = kwargs.get('str_date_start')
  23.         self.ftp = kwargs.get("ftp")
  24.         self.remote_path = kwargs.get("remote_path")
  25.         if not (os.path.exists(self.path)):
  26.             os.mkdir(self.path)
  27.         config_path = self.path + "/Config"
  28.         if not(os.path.exists(config_path)):
  29.             config = set_config(config_path)
  30.             config.setup()
  31.         self.login_path = config_path + "/login.yaml"
  32.         self.data_path = kwargs.get('data_path')
  33.         self.path_path = config_path + "/path.yaml"
  34.     #checks: file type, whether the file has the right start, and if it isnt from today
  35.     def get_file_date(self, name):
  36.         split = name.split("_")
  37.         day = split[-1][:-4]
  38.         date = parse(split[-3] + "-" + split[-2] + "-" + day)
  39.         return date.date()
  40.     def filter_remote_files(self):
  41.         relevant_files = []
  42.         #date_start = datetime.now().date()
  43.         now = datetime.now().date()
  44.         #if self.str_date_start != None:
  45.         date_start = parse(self.str_date_start).date()
  46.         #else:
  47.             #data_start = parse('2017-1-1').date()
  48.         for name in self.ftp.nlst():
  49.             if(name.startswith(self.start_str)):
  50.                 file_date = self.get_file_date(name)
  51.                 if(file_date < now and file_date >= date_start ):
  52.                     relevant_files.append(name)
  53.         return relevant_files
  54.     #retrieves the files from remote server
  55.     def retrieve_files(self):
  56.         if(self.data_path == None):
  57.             with open(self.path_path, 'r') as stream:
  58.                 data_dict = yaml.safe_load(stream)
  59.             data_path = data_dict.get('File')
  60.         else:
  61.             data_path = self.data_path
  62.         data_path_files = []
  63.         h_remote_files = []
  64.  
  65.        
  66.        
  67.         print('BUILDING LOCAL DIR FILE LIST...')
  68.         for file_name in os.listdir(data_path):
  69.             data_path_files.append(file_name) # populate local dir list
  70.         cmdcmd = 'CWD ' + self.remote_path
  71.         self.ftp.sendcmd(cmdcmd)
  72.         print('BUILDING REMOTE DIR FILE LIST...\n')
  73.         h_remote_files = self.filter_remote_files()
  74.  
  75.         h_diff = sorted(list(set(h_remote_files) - set(data_path_files))) # difference between two lists
  76.  
  77.         for h in h_diff:
  78.             with open(os.path.join(data_path,h), 'wb') as ftpfile:
  79.                 s = '0'
  80.                 print('File: ' + h)
  81.                 s = self.ftp.retrbinary('RETR ' + h, lambda d: ftpfile.write(d)) # retrieve file
  82.                 if str(s).startswith('226'): # comes from ftp status: '226 Transfer complete.'
  83.                     print ('\nOK\n') # print 'OK' if transfer was successful
  84.                 else:
  85.                     print (s) # if error, print retrbinary's return
  86.     #try to login with current user info
  87.     #try to login with current user info
  88.     def try_login(self, username, password):
  89.         print("Trying to login...")
  90.         print (username)
  91.         try:
  92.             self.ftp.login(username, password)
  93.         except Exception as e:
  94.             messagebox.showinfo("-- ERROR --", "Please enter valid infomation!", icon="warning")
  95.     #enter user info from yaml file
  96.  
  97.     #main function, establishes shared variables, and makes sure the program can run, and all config is setup
  98.     def ftp_sync(self):
  99.  
  100.         username = "" #mutable variable for username
  101.         password = ""#mutable var for pwd
  102.         with open(self.login_path, 'r') as stream:
  103.             login_info = yaml.safe_load(stream)
  104.             username = login_info.get("Username")
  105.             password = login_info.get("Password")
  106.             print(username)
  107.         self.try_login(username, password)
  108.         #if login is sucsessful retrieve files
  109.         self.retrieve_files()
  110. class Preprocessing_Base:
  111.     # path, alarm_name, del_cols, intended_cols, expected_cols, cols_to_fix, deviation_cols, min_cols
  112.     def __init__(self,**kwargs):
  113.         self.path = kwargs.get('path')
  114.         self.from_path = kwargs.get('from_path')
  115.         self.alarm_name = kwargs.get('alarm_name')
  116.         self.del_cols = kwargs.get('del_cols')
  117.         self.intended_cols = kwargs.get('intended_cols')
  118.         self.cols_to_fix = kwargs.get('cols_to_fix')
  119.         self.deviation_cols = kwargs.get('deviation_cols')
  120.         self.one_strs = kwargs.get("one_strs")
  121.         self.temp_cols = kwargs.get("temp_cols")
  122.         self.zero_strs = kwargs.get("zero_strs")
  123.         self.binary_cols = kwargs.get("binary_cols")
  124.     #derives unit_name from filename
  125.     def unit_name(self, cols, file_n, df):
  126.         if len(cols) == len(self.intended_cols):
  127.             # Add the Unit_Names for the file_names
  128.             unitName = file_n.split("_")[0]
  129.             df["Unit_Name"] = unitName.upper()
  130.         else:
  131.             df = df.drop(df.columns[len(self.intended_cols):],axis=1)
  132.             # Add the Unit_Names for the file_names
  133.             unitName = file_n.split("_")[0]
  134.             df["Unit_Name"] = unitName.upper()
  135.         return unitName
  136.         #//
  137.     #sets values to string for entire column
  138.     def df_to_string(self, column, df):
  139.         #alarm strin
  140.         df[column] = df[column].astype(str)
  141.         #/
  142.     #changes zeroes and ones to zerostr and onestr
  143.     def binary_to_string(self, binary_col, df):
  144.        
  145.         if(binary_col in df.columns):
  146.             self.df_to_string(binary_col, df)
  147.  
  148.             if(binary_col[-2] == '_'):
  149.                 key_col = binary_col[:-2]
  150.             else:
  151.                 key_col = binary_col
  152.             n = []
  153.             for val in df[binary_col].values:
  154.                 if(val == '1'):
  155.                     n.append(self.one_strs.get(key_col))
  156.                 elif(val == '0'):
  157.                     n.append(self.zero_strs.get(key_col))
  158.                 else:
  159.                     n = df[binary_col]
  160.                     break
  161.             df[binary_col] = pd.DataFrame(n)
  162.     def binary_col_array(self, binary_cols, df):
  163.         for binary_col in binary_cols:
  164.             self.binary_to_string(binary_col, df)
  165.     #old more specific code I dont feel like replacingough del_cols and sees if any are in df if any are they get yote
  166.     def delete_cols(self, df, del_cols):
  167.         # Delete unused columns if they exist
  168.         for thisCol in del_cols:
  169.             if thisCol in df.columns:
  170.                 df.drop(thisCol, axis=1, inplace=True)
  171.                         #print(thisCol + "dropped")
  172.         #/ abs value of deviations basically useless and shouldnt be in base but I dont want to restructure so here it stays
  173.     def take_abs_of_devs(self, df):
  174.         for dev_col in self.deviation_cols:
  175.             df[dev_col] = pd.DataFrame(np.absolute(df[dev_col].values))
  176.         #*/
  177.     #creates file based on cleaned up frame
  178.     def create_file(self, result_dir, fileN, df):
  179.         resultFrame = df    
  180.         if resultFrame.shape[0] > 0:
  181.             k = result_dir+"\\"+fileN[:-4] +"_.csv"
  182.             resultFrame.to_csv(k)
  183.     #prepares lifetime value columns, but I didnt really understand that when I wrote it
  184.     def prepare_col(self, name, zero, df):
  185.         if(name in df.columns):
  186.             print('ayayay')
  187.             if df[name].dtype == "int64":
  188.                 n = np.ediff1d(df[name].values)
  189.                 n = np.append(zero, n)
  190.                 n = n.clip(min=0, max=1)
  191.  
  192.                 df[name] = pd.DataFrame(n)                
  193.             elif df[name].values[0].isdigit():
  194.                 name_int = [int(x) for x in df[name].values]
  195.                 n = np.ediff1d(name_int)
  196.                 n = np.append(zero, n)
  197.                 n = n.clip(min=0, max=1)
  198.  
  199.                 df[name] = pd.DataFrame(n)
  200.     #prepares an array of lifetime value cols
  201.     def prepare_arr_of_cols(self, zero, cols_to_fix, df):
  202.         for col in cols_to_fix:
  203.             self.prepare_col(col, zero, df)
  204.     #finds files that are different between result dir and data directory
  205.     def find_different(self, data_path, result_dir):
  206.         already_processed = []    
  207.         file_names = []
  208.         ##### Make sure to read only the csv files in directory
  209.         with os.scandir(result_dir) as listOfEntries:
  210.             for entry in listOfEntries:
  211.                 if entry.name[-5].isdigit():
  212.                     already_processed.append(entry.name[:-6] + ".csv")
  213.                 else :
  214.                     already_processed.append(entry.name[:-5] + ".csv")
  215.         with os.scandir(data_path) as listOfEntries:
  216.            for entry in listOfEntries:
  217.                # print all entries that are files
  218.  
  219.                if entry.is_file() and entry.name[-4:] == ".csv":
  220.  
  221.                    file_names.append(entry.name)
  222.  
  223.         file_names = sorted(list(set(file_names) - set(already_processed)))
  224.         return file_names
  225.     #names a multiple unit NAMEONE and NAMETWO post split
  226.     def delta_t(self, temp_cols, df):
  227.         df[temp_cols["out"]] = df[temp_cols["out"]].astype(float)
  228.         df[temp_cols["in"]] = df[temp_cols["in"]].astype(float)
  229.  
  230.         return(df[temp_cols["out"]] - df[temp_cols["in"]])
  231.     def unit_name_multiple(self, data_path, df, i):
  232.         location = os.path.basename(os.path.normpath(data_path))
  233.         name = location.split(" ")[0] + num2words(i)
  234.         df["Unit_Name"] = name.upper()
  235.  
  236.         df["Location"] = location.upper()
  237.         df["Category"] = "Field"
  238.  
  239.     def unit_name_from_path(self, data_path, df):
  240.         name = os.path.basename(os.path.normpath(data_path))
  241.         name = name.split(" ")[0]
  242.         df["Unit_Name"] = name.upper()
  243.         df["Location"] = name.upper()
  244.         df["Category"] = "Field"
  245.         if not self.temp_cols == None:
  246.             df["Delta_T"] = self.delta_t(self.temp_cols, df)
  247.  
  248.     #extends beyond just multiples
  249.     def del_row_with_dashes(self, df):
  250.         for col in df.columns[2:]:
  251.             df.drop(df.loc[df[col] == '---'].index, inplace =True)
  252.  
  253.     def create_multiple_file(self, df, result_dir, fileN, i, intended_cols_i, data_path):
  254.         try:
  255.             split_df = df[intended_cols_i].copy()
  256.             split_df.columns = self.intended_cols[:-1]
  257.             self.del_row_with_dashes(split_df)
  258.             zero = np.array([0])
  259.  
  260.             self.prepare_arr_of_cols(zero, self.cols_to_fix, split_df)
  261.             if not self.temp_cols == None:
  262.                 split_df["Delta_T"] = self.delta_t(self.temp_cols, split_df)
  263.             self.unit_name_multiple(data_path, split_df, i)
  264.             self.binary_col_array(self.binary_cols, split_df)
  265.  
  266.             split_df["Delta_T"] = self.delta_t(self.temp_cols, split_df)
  267.             k = result_dir+"\\"+fileN[:-4] +"_" + str(i) +".csv"
  268.             split_df.to_csv(k)
  269.         except Exception as e:
  270.             pass
  271.     #meant to format multiple cols but would probably just end up being overwritten every single module so its abstract
  272.     def format_multiple_cols(self, df, fileN, result_dir, data_path, i, last_int):
  273.         colnames = df.columns.values.tolist()
  274.         intended_cols_i =['Time', 'Date'] + [col + "_" + str(i) for col in self.intended_cols[2:-1]]
  275.         print(len(intended_cols_i))
  276.  
  277.         #if not self.binary_cols == None:
  278.         #intended_cols_i.append("SW_VERSN")
  279.         #df.columns = intended_cols_i
  280.         zero = np.array([0])
  281.         #self.df_to_string(alarm_i, df)
  282.         self.unit_name_multiple(data_path, df, i)
  283.         self.create_multiple_file(df, result_dir, fileN, i, intended_cols_i, data_path)
  284.         if i < int(last_int):
  285.             i+= 1
  286.             self.format_multiple_cols(df, fileN, result_dir, data_path, i, last_int)
  287.     #I should probably move this into my data files instead of the module since it is the main driving fnction
  288.     def format_cols(self, cols, df, fileN, result_dir, data_path):
  289.         if not self.alarm_name == None:
  290.             self.df_to_string(self.alarm_name, df)
  291.         if df.shape[0] > 1: #Ignore changing the files with only one row
  292.             cols = df.columns
  293.             #retrieve unit name
  294.             if(self.from_path == True):
  295.                 self.unit_name_from_path(data_path, df)
  296.             else:
  297.                 self.unit_name(df.columns, fileN, df)
  298.  
  299.             # Fix the Gallons Columns, Successful Ignitions, Failed Ignitions
  300.             # ,Flame Failures, Burner Minutes
  301.             zero = np.array([0])
  302.             print(fileN)
  303.             self.prepare_arr_of_cols(zero, self.cols_to_fix, df)
  304.             # Taking the Absolute Value of the both the Deviations for easy
  305.             # analysis
  306.             if not self.deviation_cols == None:
  307.                 self.take_abs_of_devs(df)
  308.             if not self.binary_cols == None:
  309.                 self.binary_col_array(self.binary_cols, df)
  310.             print(fileN, len(cols), cols[-1], cols[0])
  311.  
  312.             #Save only the files that contain info
  313.             self.create_file(result_dir, fileN, df)
  314.         else:
  315.             print("\n\n\n*******\n", fileN, " = not considered in analysis\n because it has only 1 line of data")
  316.     #another main driving function of the module, meant to loop through the new files and process them
  317.     def check_if_multiple(self, df, last_int):
  318.         if(last_int.isdigit()):
  319.             if(int(last_int) > 1):
  320.                 return True
  321.     def read_files(self, data_path, result_dir):
  322.         min_cols = len(self.intended_cols)
  323.         file_names = self.find_different(data_path, result_dir)
  324.         for fileN in file_names:
  325.             #create df
  326.             try:
  327.                 df = pd.read_csv(data_path + "\\" + fileN
  328.                     )      
  329.             except Exception as e:
  330.                 continue
  331.             if not self.del_cols== None:
  332.                 self.delete_cols(df, self.del_cols)
  333.  
  334.             cols = df.columns
  335.             colnames = df.columns.values.tolist()
  336.             last_col = colnames[-12]
  337.             last_int = last_col[-1]
  338.             if( len(cols) == min_cols):
  339.                 df.columns = self.intended_cols
  340.                 self.format_cols(cols, df, fileN, result_dir, data_path)
  341.             elif(self.check_if_multiple(df, last_int)):
  342.                 self.format_multiple_cols(df, fileN, result_dir, data_path, 1, last_int)
  343.                        
  344.    
  345.             #ipdb.set_trace()
  346.     #main main driving function, starts the other two
  347.     def main(self, data_path, result_dir):
  348.         config_path = self.path + "/Config"
  349.         login_path = config_path + "/login.yaml"
  350.         path_path = config_path + "/path.yaml"
  351.         result_dir = self.path + result_dir
  352.         if os.path.exists(result_dir) == False:
  353.             os.mkdir(result_dir)
  354.         self.read_files(data_path, result_dir)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top