Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Aim: - Creating data model using Cassandra
- Step 1: - run ‘cassandra.bat’ file as administrator to start the Cassandra server
- Step 2: - Double click to execute the ‘cqlsh’ file in Python
- Step 3: - Create a Data model as following:
- cqlsh> create keyspace key_st_041 WITH replication={'class':'SimpleStrategy','replication_factor':3};
- cqlsh> use key_st_041;
- cqlsh:key_st_041> create table dept(dept_id int PRIMARY KEY,dept_name text,dept_location text);
- cqlsh:key_st_041> create table emp(emp_id int PRIMARY KEY,emp_name text,dept_id int,emp_email text,emp_phone text);
- cqlsh:key_st_041> Insert into dept (dept_id,dept_name,dept_location) values (1001,'Accounts','Mumbai');
- cqlsh:key_st_041> Insert into dept (dept_id,dept_name,dept_location) values (1002,'Marketing','Chennai');
- cqlsh:key_st_041> Insert into dept (dept_id,dept_name,dept_location) values (1002,'HR','Banglore');
- (Note: this insert reuses primary key 1002, so it overwrites the 'Marketing' row — which is why the later SELECT on dept returns only two rows.)
- cqlsh:key_st_041> Insert into emp (emp_id,emp_name,dept_id,emp_email,emp_phone) values (1001,'Siddhi Thakkar',1001,'s@email.com','1234567890');
- cqlsh:key_st_041> Insert into emp (emp_id,emp_name,dept_id,emp_email,emp_phone) values (1002,'Riddhi Thakkar',1002,'r@email.com','8796541230');
- cqlsh:key_st_041> Insert into emp (emp_id,emp_name,dept_id,emp_email,emp_phone) values (1003,'Ruchi Thakkar',1003,'rt@email.com','8796541889');
- cqlsh:key_st_041>
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_name | emp_phone
- --------+---------+--------------+----------------+------------
- 1001 | 1001 | s@email.com | Siddhi Thakkar | 1234567890
- 1003 | 1003 | rt@email.com | Ruchi Thakkar | 8796541889
- 1002 | 1002 | r@email.com | Riddhi Thakkar | 8796541230
- (3 rows)
- cqlsh:key_st_041> select * from dept;
- dept_id | dept_location | dept_name
- ---------+---------------+-----------
- 1001 | Mumbai | Accounts
- 1002 | Banglore | HR
- (2 rows)
- cqlsh:key_st_041> update dept set dept_name='Human Resources' where dept_id=1002;
- cqlsh:key_st_041> select * from dept;
- dept_id | dept_location | dept_name
- ---------+---------------+-----------------
- 1001 | Mumbai | Accounts
- 1002 | Banglore | Human Resources
- (2 rows)
- cqlsh:key_st_041> delete from emp where emp_id=1003;
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_name | emp_phone
- --------+---------+-------------+----------------+------------
- 1001 | 1001 | s@email.com | Siddhi Thakkar | 1234567890
- 1002 | 1002 | r@email.com | Riddhi Thakkar | 8796541230
- (2 rows)
- cqlsh:key_st_041> alter table emp add emp_location text;
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_location | emp_name | emp_phone
- --------+---------+-------------+--------------+----------------+------------
- 1001 | 1001 | s@email.com | null | Siddhi Thakkar | 1234567890
- 1002 | 1002 | r@email.com | null | Riddhi Thakkar | 8796541230
- (2 rows)
- cqlsh:key_st_041> alter table emp drop emp_location text;
- SyntaxException: line 1:34 mismatched input 'text' expecting EOF (alter table emp drop emp_location [text]...)
- cqlsh:key_st_041> alter table emp drop emp_location;
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_name | emp_phone
- --------+---------+-------------+----------------+------------
- 1001 | 1001 | s@email.com | Siddhi Thakkar | 1234567890
- 1002 | 1002 | r@email.com | Riddhi Thakkar | 8796541230
- (2 rows)
- cqlsh:key_st_041>
- ------------------------------------------------------------------------------------------
- A. Text delimited CSV to HORUS format.
- Code:
# A. Text-delimited CSV to HORUS format.
import pandas as pd  # fixed typo: the package is 'pandas', not 'panda'

# Read the source CSV; latin-1 handles extended characters in country names.
InputData = pd.read_csv(r'Desktop\Country.csv', encoding="latin-1")
print('Input Data Values ===================================')
print(InputData)

# ProcessData must be assigned BEFORE it is renamed — the original listing
# called ProcessData.rename(...) two lines before defining ProcessData,
# which raises NameError.
ProcessData = InputData
ProcessData.rename(columns={'Year': 'year'}, inplace=True)
ProcessData.rename(columns={'ccTLD': 'cctld'}, inplace=True)

# Persist the processed frame in HORUS (CSV) format.
ProcessData.to_csv(r"E:\country2.csv")
- ----------------------------------------------------------------------------
- B. XML to HORUS Format
- Code:
- # Utility Start XML to HORUS =================================
- # Standard Tools
- import pandas as pd
- import xml.etree.ElementTree as ET
def df2xml(data):
    """Serialize a pandas DataFrame to an XML byte string.

    Each row becomes an <entry> element under <root>; each column becomes a
    child element named after the column, holding the cell value as text, or
    'n/a' for missing (NaN) values.
    """
    header = data.columns
    root = ET.Element('root')
    for row in range(data.shape[0]):
        entry = ET.SubElement(root, 'entry')
        for index in range(data.shape[1]):
            schild = str(header[index])
            # SubElement both CREATES the node and ATTACHES it to `entry`.
            # The original listing additionally called entry.append(child),
            # which inserted every field element twice.
            child = ET.SubElement(entry, schild)
            if str(data[schild][row]) != 'nan':
                child.text = str(data[schild][row])
            else:
                child.text = 'n/a'
    result = ET.tostring(root)
    return result
def xml2df(xml_data):
    """Parse an XML document of <entry> records into a pandas DataFrame.

    Each direct child of the root becomes one row; each of its sub-elements
    becomes a (tag -> text) column value.
    """
    root = ET.XML(xml_data)
    records = [
        {field.tag: field.text for field in entry}
        for entry in root
    ]
    return pd.DataFrame(records)
# --- XML to HORUS driver -------------------------------------------------
# Reads the country-code XML, reshapes it into the HORUS (CSV) layout and
# writes the result.  Requires xml2df() defined above.
sInputFileName = 'C:/VKHCG/05-DS/9999-Data/Country_Code.xml'
# Context manager closes the file handle; the original leaked the handle
# returned by open().
with open(sInputFileName) as fInput:
    InputData = fInput.read()
print('=====================================================')
print('Input Data Values ===================================')
print('=====================================================')
print(InputData)
print('=====================================================')
ProcessDataXML = InputData
# XML to Data Frame
ProcessData = xml2df(ProcessDataXML)
# Remove columns ISO-2-Code and ISO-3-CODE
ProcessData.drop('ISO-2-CODE', axis=1, inplace=True)
ProcessData.drop('ISO-3-Code', axis=1, inplace=True)
# Rename Country and ISO-M49
ProcessData.rename(columns={'Country': 'CountryName'}, inplace=True)
ProcessData.rename(columns={'ISO-M49': 'CountryNumber'}, inplace=True)
# Set new Index
ProcessData.set_index('CountryNumber', inplace=True)
# Sort by CountryName, descending.  (The original comment said
# "CurrencyNumber", but the code sorts on CountryName.)
ProcessData.sort_values('CountryName', axis=0, ascending=False, inplace=True)
print('=====================================================')
print('Process Data Values =================================')
print('=====================================================')
print(ProcessData)
print('=====================================================')
OutputData = ProcessData
sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-XML-Country.csv'
# NOTE(review): index=False drops the CountryNumber index from the output —
# confirm that is the intended HORUS layout.
OutputData.to_csv(sOutputFileName, index=False)
print('=====================================================')
print('XML to HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- C. JSON to HORUS Format
- Code:
- # Utility Start JSON to HORUS =================================
- # Standard Tools
- #=============================================================
# Utility Start JSON to HORUS =================================
# Standard Tools
#=============================================================
import pandas as pd

# Input Agreement ============================================
sInputFileName = 'C:/VKHCG/05-DS/9999-Data/Country_Code.json'
InputData = pd.read_json(sInputFileName, orient='index', encoding="latin-1")
print('Input Data Values ===================================')
print(InputData)
print('=====================================================')

# Processing Rules ===========================================
ProcessData = InputData
# Drop both ISO code columns in one call.
ProcessData.drop(['ISO-2-CODE', 'ISO-3-Code'], axis=1, inplace=True)
# Rename both remaining columns in one call.
ProcessData.rename(
    columns={'Country': 'CountryName', 'ISO-M49': 'CountryNumber'},
    inplace=True,
)
# Re-index on the country number and sort by name, descending.
ProcessData.set_index('CountryNumber', inplace=True)
ProcessData.sort_values('CountryName', axis=0, ascending=False, inplace=True)
print('Process Data Values =================================')
print(ProcessData)
print('=====================================================')

# Output Agreement ===========================================
OutputData = ProcessData
sOutputFileName = 'c:/VKHCG/05-DS/9999-Data/HORUS-JSON-Country.csv'
OutputData.to_csv(sOutputFileName, index=False)
print('JSON to HORUS - Done')
- -------------------------------------------------------------------
- D. MySql Database to HORUS Format
- Code:
# D. SQLite database table to HORUS format.
import pandas as pd
import sqlite3 as sq

sInputFileName = 'C:/VKHCG/05-DS/9999-Data/utility.db'
sInputTable = 'Country_Code'
# NOTE: the table name is interpolated into the SQL text; that is acceptable
# only because sInputTable is a hard-coded constant, never user input.
sSQL = 'select * FROM ' + sInputTable + ';'
conn = sq.connect(sInputFileName)
try:
    InputData = pd.read_sql_query(sSQL, conn)
finally:
    conn.close()  # the original listing never closed the connection
print('Input Data Values ===================================')
print(InputData)
print('=====================================================')
ProcessData = InputData
ProcessData.drop('ISO-2-CODE', axis=1, inplace=True)
ProcessData.drop('ISO-3-Code', axis=1, inplace=True)
ProcessData.rename(columns={'Country': 'CountryName'}, inplace=True)
ProcessData.rename(columns={'ISO-M49': 'CountryNumber'}, inplace=True)
ProcessData.set_index('CountryNumber', inplace=True)
ProcessData.sort_values('CountryName', axis=0, ascending=False, inplace=True)
print('Process Data Values =================================')
print(ProcessData)
print('=====================================================')
OutputData = ProcessData
sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-CSV-Country.csv'
OutputData.to_csv(sOutputFileName, index=False)
print('Database to HORUS - Done')
- -------------------------------------------------------------------
- E. Picture (JPEG) to HORUS Format (Use SPYDER to run this program)
- Code:
# E. Picture (JPEG) to HORUS format.
# scipy.misc.imread was removed in SciPy 1.2; read the image with matplotlib
# (already imported by this listing) and synthesize an opaque alpha plane with
# NumPy so the rest of the flow still sees RGBA data.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

sInputFileName = 'C:/VKHCG/05-DS/9999-Data/Angus.jpg'
InputRGB = plt.imread(sInputFileName)  # JPEG decodes to (rows, cols, 3) uint8
alpha = np.full(InputRGB.shape[:2] + (1,), 255, dtype=InputRGB.dtype)
InputData = np.concatenate([InputRGB, alpha], axis=2)
print('Input Data Values ===================================')
print('X: ', InputData.shape[0])
print('Y: ', InputData.shape[1])
print('RGBA: ', InputData.shape[2])
print('=====================================================')
# Flatten the pixels and regroup them six values per row.
# NOTE(review): the XAxis/YAxis labels do not hold real coordinates — the
# reshape simply chops the flattened RGBA stream into 6-wide rows, exactly as
# in the original listing.  Confirm this is the intended HORUS layout.
ProcessRawData = InputData.flatten()
y = InputData.shape[2] + 2
x = int(ProcessRawData.shape[0] / y)
ProcessData = pd.DataFrame(np.reshape(ProcessRawData, (x, y)))
ProcessData.columns = ['XAxis', 'YAxis', 'Red', 'Green', 'Blue', 'Alpha']
ProcessData.index.names = ['ID']
print('Rows: ', ProcessData.shape[0])
print('Columns :', ProcessData.shape[1])
print('=====================================================')
print('Process Data Values =================================')
print('=====================================================')
plt.imshow(InputData)
plt.show()
print('=====================================================')
OutputData = ProcessData
print('Storing File')
sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-Picture.csv'
OutputData.to_csv(sOutputFileName, index=False)
print('=====================================================')
print('Picture to HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- F. Video to HORUS Format
- Code:
- Movie to Frames
# F. Movie to frames: dump every video frame to a numbered JPEG.
import os
import shutil
import cv2

sInputFileName = 'C:/VKHCG/05-DS/9999-Data/dog.mp4'
sDataBaseDir = 'C:/VKHCG/05-DS/9999-Data/temp'
# Start from an empty frame directory.
if os.path.exists(sDataBaseDir):
    shutil.rmtree(sDataBaseDir)
os.makedirs(sDataBaseDir)
print('=====================================================')
print('Start Movie to Frames')
print('=====================================================')
vidcap = cv2.VideoCapture(sInputFileName)
count = 0
while True:
    success, image = vidcap.read()
    if not success:
        # End of stream.  The original loop read once BEFORE the loop and
        # again at the top of the body, silently discarding frame 0; it also
        # passed image=None to imwrite on the final failed read.
        break
    sFrame = sDataBaseDir + '/dog-frame-' + format(count, '04d') + '.jpg'
    print('Extracted: ', sFrame)
    cv2.imwrite(sFrame, image)
    if os.path.getsize(sFrame) == 0:
        # Empty file: remove it and reuse this frame number.
        os.remove(sFrame)
        print('Removed: ', sFrame)
        continue
    if cv2.waitKey(10) == 27:  # exit if Escape is hit
        break
    count += 1
vidcap.release()
print('=====================================================')
print('Generated : ', count, ' Frames')
print('=====================================================')
print('Movie to Frames HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- Frames to Horus (Use SPYDER to run this program)
# Frames to HORUS.
# scipy.misc.imread was removed in SciPy 1.2 — matplotlib (already imported
# here) reads the JPEGs and NumPy appends the alpha plane.  DataFrame.append
# was removed in pandas 2.0, so the per-frame tables are collected in a list
# and concatenated once at the end.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Input Agreement ============================================
sDataBaseDir = 'C:/VKHCG/05-DS/9999-Data/temp'
FrameTables = []
for file in os.listdir(sDataBaseDir):
    if not file.endswith(".jpg"):
        continue
    sInputFileName = os.path.join(sDataBaseDir, file)
    print('Process : ', sInputFileName)
    InputRGB = plt.imread(sInputFileName)  # (rows, cols, 3) uint8
    alpha = np.full(InputRGB.shape[:2] + (1,), 255, dtype=InputRGB.dtype)
    InputData = np.concatenate([InputRGB, alpha], axis=2)
    print('Input Data Values ===================================')
    print('X: ', InputData.shape[0])
    print('Y: ', InputData.shape[1])
    print('RGBA: ', InputData.shape[2])
    print('=====================================================')
    # Processing Rules ===========================================
    ProcessRawData = InputData.flatten()
    y = InputData.shape[2] + 2
    x = int(ProcessRawData.shape[0] / y)
    ProcessFrameData = pd.DataFrame(np.reshape(ProcessRawData, (x, y)))
    ProcessFrameData['Frame'] = file  # tag every row with its source frame
    print('=====================================================')
    print('Process Data Values =================================')
    print('=====================================================')
    plt.imshow(InputData)
    plt.show()
    FrameTables.append(ProcessFrameData)

f = len(FrameTables)
if f > 0:
    ProcessData = pd.concat(FrameTables)
    sColumns = ['XAxis', 'YAxis', 'Red', 'Green', 'Blue', 'Alpha', 'FrameName']
    ProcessData.columns = sColumns
    print('=====================================================')
    # Fixed: the original set index.names on ProcessFrameData (the last
    # per-frame table) instead of the combined ProcessData.
    ProcessData.index.names = ['ID']
    print('Rows: ', ProcessData.shape[0])
    print('Columns :', ProcessData.shape[1])
    print('=====================================================')
    # Output Agreement ===========================================
    OutputData = ProcessData
    print('Storing File')
    sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-Movie-Frame.csv'
    OutputData.to_csv(sOutputFileName, index=False)
    print('=====================================================')
print('Processed ; ', f, ' frames')
print('=====================================================')
print('Movie to HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- G. Audio to HORUS Format
- Code:
- from scipy.io import wavfile
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
- #=============================================================
def show_info(aname, a, r):
    """Print basic diagnostics for an audio signal, then plot it.

    aname: human-readable label; a: sample array; r: sample rate in Hz.
    """
    divider = '----------------'
    print(divider)
    print("Audio:", aname)
    print(divider)
    print("Rate:", r)
    print(divider)
    print("shape:", a.shape)
    print("dtype:", a.dtype)
    print("min, max:", a.min(), a.max())
    print(divider)
    plot_info(aname, a, r)
- #=============================================================
def plot_info(aname, a, r):
    """Plot every channel of signal `a` (columns) on one set of axes."""
    plt.title('Signal Wave - ' + aname + ' at ' + str(r) + 'hz')
    legend_labels = []
    for ch in range(a.shape[1]):
        plt.plot(a[:, ch], label='Ch' + str(ch + 1))
        # NOTE(review): legend entries are bare channel numbers while the line
        # labels are 'ChN' — preserved from the original; confirm intended.
        legend_labels.append(str(ch + 1))
    plt.legend(legend_labels)
    plt.show()
- #=============================================================
# G. Audio to HORUS: convert each multi-channel WAV into a HORUS CSV.
# The original listing repeated the same seven statements four times (for 2,
# 4, 6 and 8 channels); drive the identical flow from a single loop instead.
sDataDir = 'C:/VKHCG/05-DS/9999-Data'
for nCh in (2, 4, 6, 8):
    sInputFileName = sDataDir + '/' + str(nCh) + 'ch-sound.wav'
    print('=====================================================')
    print('Processing : ', sInputFileName)
    print('=====================================================')
    InputRate, InputData = wavfile.read(sInputFileName)
    show_info(str(nCh) + " channel", InputData, InputRate)
    ProcessData = pd.DataFrame(InputData)
    # Label columns Ch1..ChN to match the channel count.
    ProcessData.columns = ['Ch' + str(c + 1) for c in range(nCh)]
    OutputData = ProcessData
    sOutputFileName = sDataDir + '/HORUS-Audio-' + str(nCh) + 'ch.csv'
    OutputData.to_csv(sOutputFileName, index=False)
#=============================================================
print('=====================================================')
print('Audio to HORUS - Done')
- ------------------------------------------------------------------
- Practical 3: Utilities and Auditing
- A. Fixers Utilities:
- Fixers enable your solution to take your existing data and fix a specific quality issue.
# A. Fixers: small data-quality repairs.
import string
import datetime as dt

# 1 Removing leading or lagging spaces from a data entry
print('#1 Removing leading or lagging spaces from a data entry')
baddata = " Data Science with too many spaces is bad!!! "
print('>', baddata, '<')
cleandata = baddata.strip()
print('>', cleandata, '<')

# *******************************************************
# 2 Removing nonprintable characters from a data entry
print('#2 Removing nonprintable characters from a data entry')
# Membership tests against a set are O(1); the original listing built this
# set but then filtered against the raw string.printable string instead.
printable = set(string.printable)
baddata = "Data\x00Science with\x02 funny characters is \x10bad!!!"
cleandata = ''.join(filter(lambda x: x in printable, baddata))
print('Bad Data : ', baddata)
print('Clean Data : ', cleandata)

# ***************************************************************
# 3 Reformatting data entry to match specific formatting criteria.
# Convert YYYY-MM-DD to "DD Month YYYY".  (The original comment said
# YYYY/MM/DD, but the code formats with dashes.)
print('# 3 Reformatting data entry to match specific formatting criteria.')
baddate = dt.date(2019, 10, 31)
baddata = format(baddate, '%Y-%m-%d')
gooddate = dt.datetime.strptime(baddata, '%Y-%m-%d')
gooddata = format(gooddate, '%d %B %Y')
print('Bad Data : ', baddata)
print('Good Data : ', gooddata)
- -------------------------------------------------------------------
- C. Averaging of Data
import pandas as pd

# C. Averaging of data: mean latitude per (country, place) pair.
sFileName = r"C:\Users\admin\Desktop\MSC(IT)-part1\IP_DATA_ALL.csv"
print('Loading :', sFileName)
IP_DATA_ALL = pd.read_csv(
    sFileName,
    header=0,
    low_memory=False,
    usecols=['Country', 'Place Name', 'Latitude', 'Longitude'],
    encoding="latin-1",
)
# Make the place column a valid identifier-like name.
IP_DATA_ALL.rename(columns={'Place Name': 'Place_Name'}, inplace=True)
AllData = IP_DATA_ALL[['Country', 'Place_Name', 'Latitude']]
print(AllData)
MeanData = AllData.groupby(['Country', 'Place_Name'])['Latitude'].mean()
print(MeanData)
- ---------------------------------------------------------------
- B. Data Binning or Bucketing
import pandas as pd

# B. Data binning: bucket exam marks into three grades.
data = pd.read_csv(r"E:\Book1.csv")
bin_edges = [0, 45, 75, 100]  # intervals (0,45], (45,75], (75,100]
bin_labels = ["not_satisfy", "good", "Excellent"]
data1 = pd.cut(data['Marks'], bin_edges, labels=bin_labels)
data1
#note: make csv file book1 name,marks column
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Histogram of normally distributed sample data with fitted PDF overlay.
import numpy as np
import matplotlib.mlab as mlab  # NOTE(review): unused — mlab.normpdf was removed from matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats

np.random.seed(0)
# example data
mu = 90      # mean of distribution
sigma = 25   # standard deviation of distribution
x = mu + sigma * np.random.randn(5000)
num_bins = 25
fig, ax = plt.subplots()
# the histogram of the data (density expects a bool, not the literal 1)
n, bins, patches = ax.hist(x, num_bins, density=True)
# add a 'best fit' line: scipy.stats.norm.pdf replaces the removed
# mlab.normpdf(bins, mu, sigma)
y = stats.norm.pdf(bins, mu, sigma)
ax.plot(bins, y, '--')
ax.set_xlabel('Example Data')
ax.set_ylabel('Probability density')
# Raw strings: \mu and \sigma are TeX escapes, not Python escapes.  The
# original listing also wrapped this expression across lines without a
# continuation, which is a SyntaxError.
sTitle = (r'Histogram ' + str(len(x)) + ' entries into ' + str(num_bins)
          + r' Bins: $\mu=' + str(mu) + r'$, $\sigma=' + str(sigma) + '$')
ax.set_title(sTitle)
fig.tight_layout()
sPathFig = 'C:/VKHCG/05-DS/4000-UL/0200-DU/DU-Histogram.png'
fig.savefig(sPathFig)
plt.show()
- -------------------------------------------------------------------
- D. Outlier Detection
- C:\VKHCG\05-DS\4000-UL\0200-DU\DU-Outliers.py
- Code:
- ################################################################
- import pandas as pd
- sFileName = r"C:\practical-data-science-master\VKHCG\01-Vermeulen\00-RawData\data\IP_DATA_ALL.csv"
- IP_DATA_ALL=pd.read_csv(sFileName,header=0,low_memory=False,
- usecols=['Country','Place Name','Latitude','Longitude'], encoding="latin-1")
- LondonData=IP_DATA_ALL.loc[IP_DATA_ALL['Place Name']=='New York']
- AllData=LondonData[['Country', 'Place Name','Latitude']]
- print(AllData)
- MeanData=AllData.groupby(['Country', 'Place Name'])['Latitude'].mean()
- StdData=AllData.groupby(['Country', 'Place Name'])['Latitude'].std()
- print('Outliers')
- UpperBound=float(MeanData+StdData)
- print('Higher than ', UpperBound)
- OutliersHigher=AllData[AllData.Latitude>UpperBound]
- print(OutliersHigher)
- LowerBound=float(MeanData-StdData)
- print('Lower than ', LowerBound)
- OutliersLower=AllData[AllData.Latitude<LowerBound]
- print(OutliersLower)
- print('Not Outliers')
- OutliersNot=AllData[(AllData.Latitude>=LowerBound) & (AllData.Latitude<=UpperBound)]
- print(OutliersNot)
- -----------------------------------------
- ----Write a Python / R program for basic logging in data science.
import logging

# Root-logger configuration: DEBUG level, timestamped records, written to the
# given log file ('w' truncates the file on every run).
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    filename=r'C:\Users\admin\Desktop\MSC(IT)-part1\abc.log',
    filemode='w',
)
logger = logging.getLogger()
import math
import logging

# Module-level logger named after the module; records still propagate to the
# root handlers configured above.
logger = logging.getLogger(__name__)

def equation(a, b, c):
    """Return the two real roots of a*x**2 + b*x + c = 0.

    Assumes a != 0 and a non-negative discriminant; math.sqrt raises
    ValueError when the roots are complex.
    """
    logger.debug("compute disc")
    disc = b**2 - 4*a*c
    logger.debug("compute roots")
    # Quadratic formula: the WHOLE numerator is divided by 2a.  The original
    # computed -b + sqrt(disc)/2*a, i.e. -b + (sqrt(disc)/2)*a, which is
    # wrong for every input (e.g. it returned (1, -1) for x**2 - 4 = 0).
    root1 = (-b + math.sqrt(disc)) / (2*a)
    root2 = (-b - math.sqrt(disc)) / (2*a)
    logger.debug("return roots")
    return root1, root2

equation(1, 0, -4)
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Per-company / per-layer logging setup: creates a fresh Logging directory and
# a uniquely named log file for every (company, layer) pair, then emits one
# message at each level through an application-specific logger.
import sys
import os
import logging
import uuid
import shutil
import time

############################################################
Base = 'C:/VKHCG'
############################################################
sCompanies = ['01-Vermeulen', '02-Krennwallner', '03-Hillman', '04-Clark']
sLayers = ['01-Retrieve', '02-Assess', '03-Process', '04-Transform', '05-Organise', '06-Report']
sLevels = ['debug', 'info', 'warning', 'error']

for sCompany in sCompanies:
    sFileDir = Base + '/' + sCompany
    if not os.path.exists(sFileDir):
        os.makedirs(sFileDir)
    for sLayer in sLayers:
        log = logging.getLogger()  # root logger
        # Remove all old handlers so basicConfig reconfigures on every pass
        # (basicConfig is a no-op while the root logger still has handlers).
        for hdlr in log.handlers[:]:
            log.removeHandler(hdlr)
        #----------------------------------------------------------------------------------
        sFileDir = Base + '/' + sCompany + '/' + sLayer + '/Logging'
        if os.path.exists(sFileDir):
            shutil.rmtree(sFileDir)
        time.sleep(2)
        if not os.path.exists(sFileDir):
            os.makedirs(sFileDir)
        skey = str(uuid.uuid4())
        sLogFile = Base + '/' + sCompany + '/' + sLayer + '/Logging/Logging_' + skey + '.log'
        print('Set up:', sLogFile)
        # Set up logging to file.
        # FIX: the original listing had `datefmt=fmt='%m-%d%H:%M'`, which is a
        # SyntaxError; basicConfig takes a single `datefmt` keyword argument.
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                            datefmt='%m-%d%H:%M',
                            filename=sLogFile,
                            filemode='w')
        # Handler that mirrors INFO-and-above messages to sys.stderr.
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        # Simpler format for console use.
        formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
        console.setFormatter(formatter)
        logging.getLogger('').addHandler(console)
        # Log to the root logger first...
        logging.info('Practical Data Science is fun!.')
        for sLevel in sLevels:
            # NOTE(review): 'Apllication' is misspelled in the original logger
            # name; kept as-is in case the name is matched downstream.
            sApp = 'Apllication-' + sCompany + '-' + sLayer + '-' + sLevel
            logger = logging.getLogger(sApp)
            if sLevel == 'debug':
                logger.debug('Practical Data Science logged a debugging message.')
            if sLevel == 'info':
                logger.info('Practical Data Science logged information message.')
            if sLevel == 'warning':
                logger.warning('Practical Data Science logged a warning message.')
            if sLevel == 'error':
                logger.error('Practical Data Science logged an error message.')
- -------------------------------------------------------------
- Retrieve Superstep
- A) Aim: - Perform the following data processing using R.
- library(readr)
- Doctor_Details1 <- read_csv("Doctor_Details1.csv")
- View(Doctor_Details1)
- spec(Doctor_Details1)
- > library(readr)
- > Doctor_Details1 <- read_csv("Doctor_Details1.csv")
- Parsed with column specification:
- cols(
- Doc_Id = col_double(),
- Doc_Name = col_character(),
- Doc_Dep = col_character(),
- Start_Time = col_time(format = ""),
- End_Time = col_time(format = ""),
- Doc_Salary = col_double(),
- No_of_patients = col_double()
- )
- > view(Doctor_Details1)
- > spec(Doctor_Details1)
- cols(
- Doc_Id = col_double(),
- Doc_Name = col_character(),
- Doc_Dep = col_character(),
- Start_Time = col_time(format = ""),
- End_Time = col_time(format = ""),
- Doc_Salary = col_double(),
- No_of_patients = col_double()
- )
- library(tibble)
- set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- > library(tibble)
- > set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- # A tibble: 79 x 7
- Doc_Id Doc_Name Doc_Dep Start_Time End_Time Doc_Salary
- <dbl> <chr> <chr> <time> <time> <dbl>
- 1 1 Akhil Dots 07:51 03:34 82148
- 2 2 Nikil Eye 07:29 02:13 72824
- 3 3 Sharma Eye 07:58 02:31 89707
- 4 4 Dev E&T 07:26 05:51 59634
- 5 5 Das Dots 07:31 05:00 68756
- 6 6 Shilpa E&T 06:04 01:33 79237
- 7 7 Shetty Dots 07:36 04:23 85601
- 8 8 Darshit General 08:29 04:56 66319
- 9 9 Abhishek Eye 08:00 02:38 61841
- 10 10 Shrikant General 06:30 02:39 54774
- # ... with 69 more rows, and 1 more variable:
- # No_of_patients <dbl>
- Doctor_Details1_Fix = set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- sapply(Doctor_Details1_Fix,typeof)
- > Doctor_Details1_Fix = set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- > sapply(Doctor_Details1_Fix,typeof)
- Doc_Id Doc_Name Doc_Dep
- "double" "character" "character"
- Start_Time End_Time Doc_Salary
- "double" "double" "double"
- No_of_patients
- "double"
- library(data.table)
- hist_DocName=data.table(DocName=unique(DocDetails_Fix[is.na(DocDetails_Fix['DocName'])==0,]$DocName))
- view(hist_DocName)
- > view(hist_DocName)
- sapply(DocDetails['DocSalary'],mean,na.rm=TRUE)
- > sapply(DocDetails['DocSalary'],mean,na.rm=TRUE)
- DocSalary
- 71990.71
- -----------------------------------------------------------------
- B) Aim: - Program to retrieve different attributes of data.
- Code: -
# B. Retrieve the attributes (column names/types) of a data set and write a
# cleaned copy with a RowID index.
import sys
import os
import pandas as pd

# Raw string so the backslashes in the Windows path are not treated as escape
# sequences (the plain string in the original triggers invalid-escape
# warnings on modern Python).
sFileName = r'C:\Ab\VKHCG\Doctor_Details1.csv'
IP_DATA_ALL = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
print('Rows:', IP_DATA_ALL.shape[0])
print('Columns:', IP_DATA_ALL.shape[1])
print('### Raw Data Set #####################################')
for i in range(0, len(IP_DATA_ALL.columns)):
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))
print('### Fixed Data Set ###################################')
# NOTE: this is a reference to the SAME frame, not a copy — the renaming loop
# below also changes IP_DATA_ALL's column labels.
IP_DATA_ALL_FIX = IP_DATA_ALL
for i in range(0, len(IP_DATA_ALL.columns)):
    # Strip surrounding whitespace, then turn embedded spaces into dots so
    # every column label is a clean, space-free name.
    cNameOld = IP_DATA_ALL_FIX.columns[i] + ' '
    cNameNew = cNameOld.strip().replace(" ", ".")
    IP_DATA_ALL_FIX.columns.values[i] = cNameNew
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))
print('Fixed Data Set with ID')
IP_DATA_ALL_with_ID = IP_DATA_ALL_FIX
IP_DATA_ALL_with_ID.index.names = ['RowID']
#print(IP_DATA_ALL_with_ID.head())
sFileName2 = sFileDir + '/Retrieve_IP_DATA.csv'
IP_DATA_ALL_with_ID.to_csv(sFileName2, index=True, encoding="latin-1")
print('### Done!! ############################################')
- -------------------------------------------------------------------
- C) Aim: - Data Pattern.
- Code: -
- library(readr)
- library(data.table)
- # Build the input path and load the doctor-details CSV.
- FileName=paste0('C:/Ab/VKHCG/DocDetails.csv')
- IP_DATA_ALL <- read_csv(FileName)
- # Distinct doctor names.
- hist_DocName=data.table(DocName=unique(IP_DATA_ALL$DocName))
- # Pair each name with a copy that will be rewritten as a pattern string.
- pattern_DocName=data.table(DocName=hist_DocName$DocName,PatternDocName=hist_DocName$DocName)
- # Every letter (upper or lower case) is mapped to 'A'.
- oldchar=c(letters,LETTERS)
- newchar=replicate(length(oldchar),"A")
- for (r in seq(nrow(pattern_DocName))){
- s=pattern_DocName[r,]$PatternDocName;
- for (c in seq(length(oldchar))){
- s=chartr(oldchar[c],newchar[c],s)
- };
- # Every digit is mapped to 'N'.
- for (n in seq(0,9,1)){
- s=chartr(as.character(n),"N",s)
- };
- # Spaces map to 'b' and dots map to 'u'.
- s=chartr(" ","b",s)
- s=chartr(".","u",s)
- pattern_DocName[r,]$PatternDocName=s;
- };
- View(pattern_DocName)
- --------------------------------------------------------------
- D) Aim: - Loading IP_DATA_ALL.
- Code: -
# D. Loading IP_DATA_ALL: read the CSV, normalise the column labels, and
# write a copy keyed by a RowID index.
import sys
import os
import pandas as pd

sFileName = 'C:\Ab\VKHCG\Doctor_Details1.csv'
IP_DATA_ALL = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)

# Profile of the raw frame.
print('Rows:', IP_DATA_ALL.shape[0])
print('Columns:', IP_DATA_ALL.shape[1])
print('### Raw Data Set #####################################')
for i in range(0, len(IP_DATA_ALL.columns)):
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))

print('### Fixed Data Set ###################################')
IP_DATA_ALL_FIX = IP_DATA_ALL
for i in range(0, len(IP_DATA_ALL.columns)):
    # Trim whitespace and replace embedded spaces with dots.
    cNameNew = (IP_DATA_ALL_FIX.columns[i] + ' ').strip().replace(" ", ".")
    IP_DATA_ALL_FIX.columns.values[i] = cNameNew
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))

print('Fixed Data Set with ID')
IP_DATA_ALL_with_ID = IP_DATA_ALL_FIX
IP_DATA_ALL_with_ID.index.names = ['RowID']
#print(IP_DATA_ALL_with_ID.head())
sFileName2 = sFileDir + '/Retrieve_IP_DATA.csv'
IP_DATA_ALL_with_ID.to_csv(sFileName2, index=True, encoding="latin-1")
print('### Done!! ############################################')
- --------------------------------------------------------------------------------------------
- A) Aim: - Perform error management on the given data using pandas package
- i. Drop the Columns Where All Elements Are Missing Values
- Code: -
# i. Drop the columns where ALL elements are missing values.
import sys
import os
import pandas as pd

sInputFileName = 'Good-or-Bad.csv'
sOutputFileName = 'Good-or-Bad-01.csv'
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)

# Load the raw file and show its profile.
RawData = pd.read_csv('C:/Ab/VKHCG/' + sInputFileName, header=0)
print('## Raw Data Values')
print(RawData)
print('## Data Profile')
print('Rows :', RawData.shape[0])
print('Columns :', RawData.shape[1])
# Re-save the raw copy alongside the output.
RawData.to_csv(sFileDir + '/' + sInputFileName, index=False)

# Keep only columns that have at least one non-missing value.
TestData = RawData.dropna(axis=1, how='all')
print('## Test Data Values')
print(TestData)
print('## Data Profile')
print('Rows :', TestData.shape[0])
print('Columns :', TestData.shape[1])
TestData.to_csv(sFileDir + '/' + sOutputFileName, index=False)
print('### Done!! #####################')
- --------------------------------------------------------------
- ii. Drop the Columns Where Any of the Elements Is Missing Values
- Code: -
import sys
import os
import pandas as pd

# Error management ii: drop every column that contains ANY missing value.
sInputFileName = 'Good-or-Bad.csv'
# fix: the original wrote to 'Good-or-Bad-01.csv', silently clobbering
# step i's output; each step should keep its own result file.
sOutputFileName = 'Good-or-Bad-02.csv'
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = 'C:/Ab/VKHCG/' + sInputFileName
RawData = pd.read_csv(sFileName, header=0)
print('## Raw Data Values')
print(RawData)
print('## Data Profile')
print('Rows :', RawData.shape[0])
print('Columns :', RawData.shape[1])
# re-write the raw input alongside the cleaned output
sFileName = sFileDir + '/' + sInputFileName
RawData.to_csv(sFileName, index=False)
# how='any' discards a column as soon as one value is missing
TestData = RawData.dropna(axis=1, how='any')
print('## Test Data Values')
print(TestData)
print('## Data Profile')
print('Rows :', TestData.shape[0])
print('Columns :', TestData.shape[1])
sFileName = sFileDir + '/' + sOutputFileName
TestData.to_csv(sFileName, index=False)
print('### Done!! #####################')
- --------------------------------------------------------------
- iii. Keep Only the Rows That Contain a Maximum of Two Missing Values
- Code: -
import sys
import os
import pandas as pd

# Error management iii: keep only rows with at most two missing values.
sInputFileName = 'Good-or-Bad.csv'
# fix: the original wrote to 'Good-or-Bad-01.csv', clobbering step i's
# output; step iii gets its own result file.
sOutputFileName = 'Good-or-Bad-03.csv'
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = 'C:/Ab/VKHCG/' + sInputFileName
RawData = pd.read_csv(sFileName, header=0)
print('## Raw Data Values')
print(RawData)
print('## Data Profile')
print('Rows :', RawData.shape[0])
print('Columns :', RawData.shape[1])
# re-write the raw input alongside the cleaned output
sFileName = sFileDir + '/' + sInputFileName
RawData.to_csv(sFileName, index=False)
# fix: dropna(thresh=k) keeps rows with at least k NON-missing values.
# The original thresh=2 kept any row with two or more real values,
# which is not "at most two missing".  Allowing at most two missing
# values means a row needs at least (columns - 2) real values.
TestData = RawData.dropna(thresh=RawData.shape[1] - 2)
print('## Test Data Values')
print(TestData)
print('## Data Profile')
print('Rows :', TestData.shape[0])
print('Columns :', TestData.shape[1])
sFileName = sFileDir + '/' + sOutputFileName
TestData.to_csv(sFileName, index=False)
print('### Done!! #####################')
- ---------------------------------------------------------------
- B) Aim: - Write Python program to create the network routing diagram from the given data.
- Code: -
import sys
import os
import pandas as pd

# Build DocDetailsST.csv: rename/trim the salary and time feeds, join
# them on DocId and prefix every merged column with 'Doc_'.
pd.options.mode.chained_assignment = None
Base = 'F:/VSIT/MscIT/Part-1/SEM-I/DS/Practs'
sInputFileName1 = '/DocSalDetails.csv'
sInputFileName2 = '/DocTimeDetails.csv'
sInputFileName3 = '/DocDetails.csv'
sOutputFileName = '/DocDetailsST.csv'

sFileName = Base + sInputFileName1
print('Loading :', sFileName)
SalData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded Salary Data:', SalData.columns.values)
print('################################')
print('Changed :', SalData.columns.values)
# one rename call instead of two; DocDep is not needed downstream
SalData.rename(columns={'DocName': 'Doc_Name', 'DocSalary': 'Doc_Salary'}, inplace=True)
SalData.drop('DocDep', axis=1, inplace=True)
print('To :', SalData.columns.values)
print('################################')

sFileName = Base + sInputFileName2
print('Loading :', sFileName)
TimeData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded Time Data:', TimeData.columns.values)
print('################################')
print('Changed :', TimeData.columns.values)
TimeData.rename(columns={'StartTime': 'Start_Time', 'EndTime': 'End_Time'}, inplace=True)
print('To :', TimeData.columns.values)
print('################################')

sFileName = Base + sInputFileName3
print('Loading :', sFileName)
# NOTE(review): DocRawData is never used below — kept only to preserve
# the original behaviour (it fails fast if the file is missing);
# confirm whether this load can be removed.
DocRawData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")

DocNetworkData = pd.merge(
    SalData,
    TimeData,
    how='inner',
    on='DocId'
)
################################################################
print('################################')
print('Change ', DocNetworkData.columns.values)
# prefix every merged column in one rename call
DocNetworkData.rename(columns={c: 'Doc_' + c for c in DocNetworkData.columns.values},
                      inplace=True)
print('To ', DocNetworkData.columns.values)
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
# fix: sOutputFileName already begins with '/', so the extra '/' the
# original inserted produced 'Practs//DocDetailsST.csv'.
sFileName = sFileDir + sOutputFileName
print('Storing :', sFileName)
DocNetworkData.to_csv(sFileName, index=False, encoding="latin-1")
print('### Done!! #####################')
- -------------------------- CONTINUED --------------------------
- Code: -
import sys
import os
import pandas as pd

# Build DocDetailsNode.csv: keep only the descriptive doctor columns
# and prefix them with 'Node_'.
pd.options.mode.chained_assignment = None
Base = 'F:/VSIT/MscIT/Part-1/SEM-I/DS/Practs'
sInputFileName = '/DocDetails.csv'
sOutputFileName = '/DocDetailsNode.csv'
sFileName = Base + sInputFileName
print('Loading :', sFileName)
IPData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded IP :', IPData.columns.values)
print('################################')
print('Changed :', IPData.columns.values)
# drop the key/visit columns that do not belong on a node record
IPData.drop(['DocId', 'StartTime', 'EndTime', 'NoOfPatients'], axis=1, inplace=True)
IPData.rename(columns={'DocName': 'Doc_Name',
                       'DocDept': 'Doc_Dept',
                       'DocSalary': 'Doc_Salary'}, inplace=True)
print('To :', IPData.columns.values)
print('################################')
print('Change ', IPData.columns.values)
# prefix every remaining column in one rename call
IPData.rename(columns={c: 'Node_' + c for c in IPData.columns.values}, inplace=True)
print('To ', IPData.columns.values)
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
# fix: sOutputFileName already begins with '/', so the extra '/' the
# original inserted produced 'Practs//DocDetailsNode.csv'.
sFileName = sFileDir + sOutputFileName
print('################################')
print('Storing :', sFileName)
print('################################')
IPData.to_csv(sFileName, index=False, encoding="latin-1")
- -------------------------------------------------------------------
- C) Aim: - Write a Python / R program to build directed acyclic graph
- Code: -
import networkx as nx
import matplotlib.pyplot as plt
import sys
import os
import pandas as pd

# Build two directed graphs from the doctor departments and render each
# to a PNG.
# NOTE(review): edges are added between EVERY ordered pair of nodes, so
# both digraphs are complete and cyclic — not true DAGs despite the
# aim; confirm the intended edge rule.
Base = 'F:/VSIT/MscIT/Part-1/SEM-I/DS/Practs'
sInputFileName = '/DocDetails.csv'
sOutputFileName1 = 'DocDAG-1.png'
sOutputFileName2 = 'DocDAG-2.png'
sFileName = Base + sInputFileName
print('Loading :', sFileName)
print('################################')
DocData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded Doctor Data :', DocData.columns.values)
print('################################')
print(DocData)
print('################################')
print('Rows : ', DocData.shape[0])
print('################################')
G1 = nx.DiGraph()
G2 = nx.DiGraph()
for i in range(DocData.shape[0]):
    G1.add_node(DocData['DocDep'][i])
    # NOTE(review): this concatenates the department with itself
    # ('X-X'); one operand was probably meant to be another column —
    # confirm.  Kept to preserve behaviour.
    sDepName = DocData['DocDep'][i] + '-' + DocData['DocDep'][i]
    G2.add_node(sDepName)
print('################################')
# fully connect G1 (both directions between every distinct pair)
for n1 in G1.nodes():
    for n2 in G1.nodes():
        if n1 != n2:
            print('Link :', n1, ' to ', n2)
            G1.add_edge(n1, n2)
print("Nodes of graph: ")
print(G1.nodes())
print("Edges of graph: ")
print(G1.edges())
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = sFileDir + '/' + sOutputFileName1
print('Storing :', sFileName)
# fix: the networkx keyword is node_color; the original's 'nodecolor'
# was not a valid argument and the colour request was dropped.
nx.draw(G1, pos=nx.spectral_layout(G1),
        node_color='r', edge_color='g',
        with_labels=True, node_size=8000,
        font_size=12)
plt.savefig(sFileName)  # save as png
plt.show()  # display
print('################################')
# fully connect G2 the same way
for n1 in G2.nodes():
    for n2 in G2.nodes():
        if n1 != n2:
            print('Link :', n1, ' to ', n2)
            G2.add_edge(n1, n2)
print("Nodes of graph: ")
print(G2.nodes())
print("Edges of graph: ")
print(G2.edges())
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = sFileDir + '/' + sOutputFileName2
print('Storing :', sFileName)
print('################################')
nx.draw(G2, pos=nx.spectral_layout(G2),
        node_color='r', edge_color='b',
        with_labels=True, node_size=8000,
        font_size=12)
plt.savefig(sFileName)
plt.show()
- -----------------------------------------------------------------
- Process Superstep
- Aim: - Process Location
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq
from pandas.io import sql
import uuid

# Process superstep: generate a 10-degree longitude/latitude grid of
# location records and store them as the Process-Location table.
Base = 'C:/VKHCG'
print('Working Base :', Base, ' using ', sys.platform)
Company = '01-Vermeulen'
InputAssessGraphName = 'Assess_All_Animals.gml'
EDSAssessDir = '02-Assess/01-EDS'
InputAssessDir = EDSAssessDir + '/02-Python'
sFileAssessDir = Base + '/' + Company + '/' + InputAssessDir
if not os.path.exists(sFileAssessDir):
    os.makedirs(sFileAssessDir)
sDataBaseDir = Base + '/' + Company + '/03-Process/SQLite'
if not os.path.exists(sDataBaseDir):
    os.makedirs(sDataBaseDir)
sDatabaseName = sDataBaseDir + '/Vermeulen.db'
conn1 = sq.connect(sDatabaseName)
sDataVaultDir = Base + '/88-DV'
# fix: the original re-tested sDataBaseDir here, so the data-vault
# directory was never created before connecting to it.
if not os.path.exists(sDataVaultDir):
    os.makedirs(sDataVaultDir)
sDatabaseName = sDataVaultDir + '/datavault.db'
conn2 = sq.connect(sDatabaseName)
t = 0
# fix: the loops step by 10 degrees, so there are (360/10)*(180/10)
# cells; the original tMax of 360*180 over-reported the total 100-fold.
tMax = (360 // 10) * (180 // 10)
LocationRows = []
for Longitude in range(-180, 180, 10):
    for Latitude in range(-90, 90, 10):
        t += 1
        IDNumber = str(uuid.uuid4())
        # fix: the second half of the name now uses Latitude; the
        # original repeated Longitude twice.
        LocationName = 'L' + format(round(Longitude, 3) * 1000, '+07d') + \
                       '-' + format(round(Latitude, 3) * 1000, '+07d')
        print('Create:', t, ' of ', tMax, ':', LocationName)
        # pd.DataFrame.from_items was removed from pandas; a plain dict
        # preserves column order, and collecting rows in a list avoids
        # the removed (and quadratic) per-row DataFrame.append.
        LocationRows.append({'ObjectBaseKey': 'GPS',
                             'IDNumber': IDNumber,
                             'LocationNumber': str(t),
                             'LocationName': LocationName,
                             'Longitude': Longitude,
                             'Latitude': Latitude})
LocationFrame = pd.DataFrame(LocationRows,
                             columns=['ObjectBaseKey', 'IDNumber', 'LocationNumber',
                                      'LocationName', 'Longitude', 'Latitude'])
LocationHubIndex = LocationFrame.set_index(['IDNumber'], inplace=False)
sTable = 'Process-Location'
print('Storing :', sDatabaseName, ' Table:', sTable)
LocationHubIndex.to_sql(sTable, conn1, if_exists="replace")
sTable = 'Hub-Location'
print('Storing :', sDatabaseName, ' Table:', sTable)
#LocationHubIndex.to_sql(sTable, conn2, if_exists="replace")
print('Vacuum Databases')
sSQL = "VACUUM;"
sql.execute(sSQL, conn1)
#sql.execute(sSQL,conn2)
print('################')
print('### Done!! ############################################')
- --------------------------------------------------------------------------------------
- Aim: - Process Event
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq
from pandas.io import sql

# Process superstep: load the action plan CSV and store it as the
# Process-Event table keyed by EventID.
Base = 'C:/VKHCG'
print('Working Base :', Base, ' using ', sys.platform)
Company = '01-Vermeulen'
InputFileName = 'Action_Plan.csv'
sDataBaseDir = Base + '/' + Company + '/03-Process/SQLite'
if not os.path.exists(sDataBaseDir):
    os.makedirs(sDataBaseDir)
sDatabaseName = sDataBaseDir + '/Vermeulen.db'
conn1 = sq.connect(sDatabaseName)
sDataVaultDir = Base + '/88-DV'
# fix: the original re-tested sDataBaseDir here, so the data-vault
# directory was never created before connecting to it.
if not os.path.exists(sDataVaultDir):
    os.makedirs(sDataVaultDir)
sDatabaseName = sDataVaultDir + '/datavault.db'
conn2 = sq.connect(sDatabaseName)
sFileName = Base + '/' + Company + '/00-RawData/' + InputFileName
print('Loading :', sFileName)
EventRawData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
EventRawData.index.names = ['EventID']
EventHubIndex = EventRawData
sTable = 'Process-Event'
print('Storing :', sDatabaseName, ' Table:', sTable)
EventHubIndex.to_sql(sTable, conn1, if_exists="replace")
sTable = 'Hub-Event'
print('Storing :', sDatabaseName, ' Table:', sTable)
#EventHubIndex.to_sql(sTable, conn2, if_exists="replace")
print('################')
print('Vacuum Databases')
sSQL = "VACUUM;"
sql.execute(sSQL, conn1)
#sql.execute(sSQL,conn2)
print('### Done!! ############################################')
- -----------------------------------------------------------------------------------------
- Aim: - Sun model
- Code: -
import sys
import os
from datetime import datetime
from pytz import timezone
import pandas as pd
import sqlite3 as sq
import uuid

# Sun model: build the Dim-Time and Dim-Person dimensions from the data
# vault and store them in the Vermeulen and warehouse databases.
pd.options.mode.chained_assignment = None
sDatabaseName = 'E:/Data Science/Vermeulen.db'
conn1 = sq.connect(sDatabaseName)
sDatabaseName = 'E:/Data Science/datavault.db'
conn2 = sq.connect(sDatabaseName)
sDatabaseName = 'E:/Data Science/datawarehouse.db'
conn3 = sq.connect(sDatabaseName)
print('\n#################################')
sSQL = " SELECT DateTimeValue FROM [Hub-Time-Gunnarsson];"
DateDataRaw = pd.read_sql_query(sSQL, conn2)
DateData = DateDataRaw.head(1000)
print(DateData, 'thisis Data')
print('Time Dimension')
print('\n#################################')
t = 0
mt = DateData.shape[0]
BirthZones = ('Atlantic/Reykjavik', 'Europe/London', 'UCT')
TimeRows = []
for i in range(mt):
    for j in range(len(BirthZones)):
        t += 1
        print(t, mt * 3)
        # fix: the original referenced the undefined names DateTimeKey
        # and DateTimeValue (NameError) and never indexed the current
        # row.  Only DateTimeValue is selected above, so it feeds both
        # fields — TODO confirm whether a key column should be queried.
        BirthDateZoneStr = DateData['DateTimeValue'][i]
        BirthDateLocal = DateData['DateTimeValue'][i]
        # NOTE(review): the original overwrote the zone with the
        # constant 'UCT' on every pass, so the three-zone loop only
        # multiplies the row count; preserved — confirm intent.
        BirthZone = 'UCT'
        IDTimeNumber = str(uuid.uuid4())
        # dicts replace the removed pd.DataFrame.from_items; rows are
        # collected in a list instead of the removed per-row append.
        TimeRows.append({'TimeID': str(IDTimeNumber),
                         'UTCDate': str(BirthDateZoneStr),
                         'LocalTime': str(BirthDateLocal),
                         'TimeZone': str(BirthZone)})
TimeFrame = pd.DataFrame(TimeRows,
                         columns=['TimeID', 'UTCDate', 'LocalTime', 'TimeZone'])
DimTime = TimeFrame
DimTimeIndex = DimTime.set_index(['TimeID'], inplace=False)
sTable = 'Dim-Time'
print('\n#################################')
print('Storing :', sDatabaseName, '\n Table:', sTable)
print('\n#################################')
DimTimeIndex.to_sql(sTable, conn1, if_exists="replace")
DimTimeIndex.to_sql(sTable, conn3, if_exists="replace")
sSQL = " SELECT " + \
       " FirstName," + \
       " SecondName," + \
       " LastName," + \
       " BirthDateKey " + \
       " FROM [Hub-Person];"
PersonDataRaw = pd.read_sql_query(sSQL, conn2)
PersonData = PersonDataRaw.head(1000)
print('\n#################################')
print('Dimension Person')
print('\n#################################')
t = 0
# fix: the row count must come from PersonData; the original reused
# DateData's shape.
mt = PersonData.shape[0]
PersonRows = []
for i in range(mt):
    t += 1
    print(t, mt)
    # fix: index the current row — str(PersonData["FirstName"]) turned
    # the WHOLE column into one repr string for every record.
    FirstName = str(PersonData['FirstName'][i])
    SecondName = str(PersonData['SecondName'][i])
    # NOTE(review): this blanks every non-empty second name, which
    # looks inverted ('== 0' would be a no-op); preserved from the
    # original — confirm intent.
    if len(SecondName) > 0:
        SecondName = ""
    LastName = str(PersonData['LastName'][i])
    BirthDateKey = str(PersonData['BirthDateKey'][i])
    IDPersonNumber = str(uuid.uuid4())
    PersonRows.append({'PersonID': str(IDPersonNumber),
                       'FirstName': FirstName,
                       'SecondName': SecondName,
                       'LastName': LastName,
                       'Zone': str('UTC'),
                       'BirthDate': BirthDateKey})
PersonFrame = pd.DataFrame(PersonRows,
                           columns=['PersonID', 'FirstName', 'SecondName',
                                    'LastName', 'Zone', 'BirthDate'])
DimPerson = PersonFrame
print(DimPerson)
DimPersonIndex = DimPerson.set_index(['PersonID'], inplace=False)
sTable = 'Dim-Person'
print('\n#################################')
print('Storing :', sDatabaseName, '\n Table:', sTable)
DimPersonIndex.to_sql(sTable, conn1, if_exists="replace")
DimPersonIndex.to_sql(sTable, conn3, if_exists="replace")
- --------------------------------------------------------------------------------------
- Organize Superstep
- A) Aim: - Organizing Data
- Code: -
import sys
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Organize superstep: load the company routing data, link every pair of
# countries and attach each 'Place(Country)' node to its country, then
# draw the graph (file writes are left disabled, as in the original).
pd.options.mode.chained_assignment = None
sFileName = 'C:\\VKHCG\\01-Vermeulen\\02-Assess\\01-EDS\\02-Python\\Assess-Network-Routing-Company1.csv'
print('Loading :', sFileName)
CompanyData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print(CompanyData.head())
print(CompanyData.shape)

G = nx.Graph()
nRecords = CompanyData.shape[0]
countries = CompanyData['Company_Country_Name']
places = CompanyData['Company_Place_Name']
# edge between every pair of distinct country names
for a in range(nRecords):
    for b in range(nRecords):
        if countries[a] != countries[b]:
            G.add_edge(countries[a], countries[b])
# edge from each country to its 'Place(Country)' node
for a in range(nRecords):
    placeNode = places[a] + '(' + countries[a] + ')'
    if countries[a] != placeNode:
        G.add_edge(countries[a], placeNode)
print('Nodes:', G.number_of_nodes())
print('Edges:', G.number_of_edges())

sFileName = 'C:\\VKHCG\\01-Vermeulen\\02-Assess\\01-EDS\\02-Python\\Assess-Network-Routing-Company1.csv'
print('Storing :', sFileName)
#nx.write_gml(G, sFileName)
sFileName = 'C:\\VKHCG\\01-Vermeulen\\02-Assess\\01-EDS\\02-Python\\Assess-Network-Routing-Company1.csv'
print('Storing Graph Image:', sFileName)
plt.figure(figsize=(15, 15))
pos = nx.spectral_layout(G, dim=2)
nx.draw_networkx_nodes(G, pos, node_color='k', node_size=10, alpha=0.8)
nx.draw_networkx_edges(G, pos, edge_color='r', arrows=False, style='dashed')
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif', font_color='b')
plt.axis('off')
#plt.savefig(sFileName,dpi=600)
plt.show()
print('### Done!! #####################')
- ----------------------------------------------------------------------------------------
- B) Aim: - Organizing Data horizontally
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq

# Organize superstep: copy a filtered, ordered horizontal slice of
# Dim-BMI from the warehouse into the data mart, then report sizes.
sDatabaseName = 'C:/Users/exam/Downloads/datawarehouse.db'
conn1 = sq.connect(sDatabaseName)
sDatabaseName = 'C:/Users/exam/Downloads/datamart.db'
conn2 = sq.connect(sDatabaseName)

sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
PersonFrame0 = pd.read_sql_query("SELECT * FROM [Dim-BMI];", conn1)

sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
# row filter (Height/Indicator) plus ordering = the horizontal slice
sSQL = ("SELECT PersonID, Height, Weight, bmi, Indicator "
        "FROM [Dim-BMI] "
        "WHERE Height > 1.5 and Indicator = 1 "
        "ORDER BY Height, Weight;")
PersonFrame1 = pd.read_sql_query(sSQL, conn1)
DimPerson = PersonFrame1
DimPersonIndex = DimPerson.set_index(['PersonID'], inplace=False)

sTable = 'Dim-BMI-Horizontal'
print('Storing :', sDatabaseName, '\n Table:', sTable)
DimPersonIndex.to_sql(sTable, conn2, if_exists="replace")

sTable = 'Dim-BMI-Horizontal'
print('Loading :', sDatabaseName, ' Table:', sTable)
PersonFrame2 = pd.read_sql_query("SELECT * FROM [Dim-BMI-Horizontal];", conn2)

print('Full Data Set (Rows):', PersonFrame0.shape[0])
print('Full Data Set (Columns):', PersonFrame0.shape[1])
print('Horizontal Data Set (Rows):', PersonFrame2.shape[0])
print('Horizontal Data Set (Columns):', PersonFrame2.shape[1])
- ---------------------------------------------------------------------------------------------
- C) Aim: - Organizing Data Vertically
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq

# Organize superstep: copy a vertical (column) slice of Dim-BMI from
# the warehouse into the data mart, then report sizes.
sDatabaseName = 'C:/Users/exam/Downloads/datawarehouse.db'
conn1 = sq.connect(sDatabaseName)
sDatabaseName = 'C:/Users/exam/Downloads/datamart.db'
conn2 = sq.connect(sDatabaseName)
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
sSQL = "SELECT * FROM [Dim-BMI];"
PersonFrame0 = pd.read_sql_query(sSQL, conn1)
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
# vertical slice: select only the three columns of interest
sSQL = ("SELECT Height, Weight, Indicator "
        "FROM [Dim-BMI];")
PersonFrame1 = pd.read_sql_query(sSQL, conn1)
DimPerson = PersonFrame1
DimPersonIndex = DimPerson.set_index(['Indicator'], inplace=False)
sTable = 'Dim-BMI-Vertical'
print('Storing :', sDatabaseName, '\n Table:', sTable)
DimPersonIndex.to_sql(sTable, conn2, if_exists="replace")
sTable = 'Dim-BMI-Vertical'
print('Loading :', sDatabaseName, ' Table:', sTable)
sSQL = "SELECT * FROM [Dim-BMI-Vertical];"
PersonFrame2 = pd.read_sql_query(sSQL, conn2)
print('Full Data Set (Rows):', PersonFrame0.shape[0])
print('Full Data Set (Columns):', PersonFrame0.shape[1])
# fix: the original labelled these prints 'Horizontal' although this
# script builds the Vertical data set.
print('Vertical Data Set (Rows):', PersonFrame2.shape[0])
print('Vertical Data Set (Columns):', PersonFrame2.shape[1])
- ----------------------------------------------------------------------------------------------
- D) Aim: - Organize Island
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq

# Organize superstep: extract a data "island" (the Indicator > 2
# subset of Dim-BMI) from the warehouse into the data mart.
sDatabaseName = 'C:/Users/exam/Downloads/datawarehouse.db'
conn1 = sq.connect(sDatabaseName)
################################################################
sDatabaseName = 'C:/Users/exam/Downloads/datamart(1).db'
conn2 = sq.connect(sDatabaseName)
################################################################
print('################')
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
sSQL = "SELECT * FROM [Dim-BMI];"
PersonFrame0 = pd.read_sql_query(sSQL, conn1)
################################################################
print('################')
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
# the island: only rows whose Indicator exceeds 2
sSQL = ("SELECT Height, Weight, Indicator "
        "FROM [Dim-BMI] "
        "WHERE Indicator > 2 "
        "ORDER BY Height, Weight;")
PersonFrame1 = pd.read_sql_query(sSQL, conn1)
################################################################
DimPerson = PersonFrame1
DimPersonIndex = DimPerson.set_index(['Indicator'], inplace=False)
################################################################
sTable = 'Dim-BMI-Vertical'
print('\n#################################')
print('Storing :', sDatabaseName, '\n Table:', sTable)
print('\n#################################')
DimPersonIndex.to_sql(sTable, conn2, if_exists="replace")
################################################################
print('################################')
sTable = 'Dim-BMI-Vertical'
print('Loading :', sDatabaseName, ' Table:', sTable)
print('################################')
sSQL = "SELECT * FROM [Dim-BMI-Vertical];"
PersonFrame2 = pd.read_sql_query(sSQL, conn2)
################################################################
print('################################')
print('Full Data Set (Rows):', PersonFrame0.shape[0])
print('Full Data Set (Columns):', PersonFrame0.shape[1])
print('################################')
# fix: the original labelled these prints 'Horizontal' although the
# extracted subset is the island slice.
print('Island Data Set (Rows):', PersonFrame2.shape[0])
print('Island Data Set (Columns):', PersonFrame2.shape[1])
- --------------------------------------------------------------------------------------------------------
- Report Superstep
- Aim: - Generating Data
- Code: -
import sys
import os
import pandas as pd
import matplotlib as ml
from matplotlib import pyplot as plt

# Report superstep: plot warehouse profit data as pie, line, bar, barh,
# area, scatter and hexbin charts.
data = [
    ['London', 29.2, 17.4],
    ['Glasgow', 18.8, 11.3],
    ['Cape Town', 15.3, 9.0],
    ['Houston', 22.0, 7.8],
    ['Perth', 18.0, 23.7],
    ['San Francisco', 11.4, 33.3]]
os_new = pd.DataFrame(data)
# fix: removed a stray pd.Index(...) expression whose result was never
# assigned (a no-op), and whose labels did not match this data anyway.
os_new.rename(columns={0: "Warehouse Location",
                       1: "Profit 2016",
                       2: "Profit 2017"}, inplace=True)
explode = (0, 0.2, 0, 0, 0, 0.1)  # pull the 2nd and 6th slices out of the pie
labels = os_new['Warehouse Location']
colors_mine = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'lightcyan', 'lightblue']
os_new.plot(figsize=(10, 10), kind="pie", y="Profit 2017", autopct='%.2f%%',
            shadow=True, explode=explode, legend=False, colors=colors_mine,
            labels=labels, fontsize=20)
sPicNameOut1 = 'E:/Data Science/pie_explode.png'
plt.savefig(sPicNameOut1, dpi=600)
plt.show()
# the remaining charts use only the first five rows
os_new.iloc[:5].plot(figsize=(10, 10), kind='line', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'])
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='bar', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'])
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='barh', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'])
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='area', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'], stacked=False)
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='scatter', x='Profit 2016',
                     y='Profit 2017', color='DarkBlue', marker='D')
plt.show()
os_new.iloc[:5].plot(figsize=(13, 10), kind='hexbin', x='Profit 2016',
                     y='Profit 2017', gridsize=25)
plt.show()
- -----------------------------------------------------------------------------------------------------
Add Comment
Please, Sign In to add comment