Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Aim: - Creating data model using Cassandra
- Step 1: - run ‘cassandra.bat’ file as administrator to start the Cassandra server
- Step 2: - Double click to execute the ‘cqlsh’ file in Python
- Step 3: - Create a Data model as following:
- cqlsh> create keyspace key_st_041 WITH replication={'class':'SimpleStrategy','replication_factor':3};
- cqlsh> use key_st_041;
- cqlsh:key_st_041> create table dept(dept_id int PRIMARY KEY,dept_name text,dept_location text);
- cqlsh:key_st_041> create table emp(emp_id int PRIMARY KEY,emp_name text,dept_id int,emp_email text,emp_phone text);
- cqlsh:key_st_041> Insert into dept (dept_id,dept_name,dept_location) values (1001,'Accounts','Mumbai');
- cqlsh:key_st_041> Insert into dept (dept_id,dept_name,dept_location) values (1002,'Marketing','Chennai');
- cqlsh:key_st_041> Insert into dept (dept_id,dept_name,dept_location) values (1002,'HR','Banglore');
- (Note: this insert reuses primary key 1002, so it overwrites the 'Marketing' row — which is why the later SELECT on dept returns only two rows.)
- cqlsh:key_st_041> Insert into emp (emp_id,emp_name,dept_id,emp_email,emp_phone) values (1001,'Siddhi Thakkar',1001,'s@email.com','1234567890');
- cqlsh:key_st_041> Insert into emp (emp_id,emp_name,dept_id,emp_email,emp_phone) values (1002,'Riddhi Thakkar',1002,'r@email.com','8796541230');
- cqlsh:key_st_041> Insert into emp (emp_id,emp_name,dept_id,emp_email,emp_phone) values (1003,'Ruchi Thakkar',1003,'rt@email.com','8796541889');
- cqlsh:key_st_041>
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_name | emp_phone
- --------+---------+--------------+----------------+------------
- 1001 | 1001 | s@email.com | Siddhi Thakkar | 1234567890
- 1003 | 1003 | rt@email.com | Ruchi Thakkar | 8796541889
- 1002 | 1002 | r@email.com | Riddhi Thakkar | 8796541230
- (3 rows)
- cqlsh:key_st_041> select * from dept;
- dept_id | dept_location | dept_name
- ---------+---------------+-----------
- 1001 | Mumbai | Accounts
- 1002 | Banglore | HR
- (2 rows)
- cqlsh:key_st_041> update dept set dept_name='Human Resources' where dept_id=1002;
- cqlsh:key_st_041> select * from dept;
- dept_id | dept_location | dept_name
- ---------+---------------+-----------------
- 1001 | Mumbai | Accounts
- 1002 | Banglore | Human Resources
- (2 rows)
- cqlsh:key_st_041> delete from emp where emp_id=1003;
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_name | emp_phone
- --------+---------+-------------+----------------+------------
- 1001 | 1001 | s@email.com | Siddhi Thakkar | 1234567890
- 1002 | 1002 | r@email.com | Riddhi Thakkar | 8796541230
- (2 rows)
- cqlsh:key_st_041> alter table emp add emp_location text;
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_location | emp_name | emp_phone
- --------+---------+-------------+--------------+----------------+------------
- 1001 | 1001 | s@email.com | null | Siddhi Thakkar | 1234567890
- 1002 | 1002 | r@email.com | null | Riddhi Thakkar | 8796541230
- (2 rows)
- cqlsh:key_st_041> alter table emp drop emp_location text;
- SyntaxException: line 1:34 mismatched input 'text' expecting EOF (alter table emp drop emp_location [text]...)
- cqlsh:key_st_041> alter table emp drop emp_location;
- cqlsh:key_st_041> select * from emp;
- emp_id | dept_id | emp_email | emp_name | emp_phone
- --------+---------+-------------+----------------+------------
- 1001 | 1001 | s@email.com | Siddhi Thakkar | 1234567890
- 1002 | 1002 | r@email.com | Riddhi Thakkar | 8796541230
- (2 rows)
- cqlsh:key_st_041>
- ------------------------------------------------------------------------------------------
- A. Text delimited CSV to HORUS format.
- Code:
# A. Text-delimited CSV to HORUS format.
import pandas as pd  # fixed typo: the package is 'pandas', not 'panda'

# Read the source CSV; latin-1 handles extended characters in country names.
InputData = pd.read_csv(r'Desktop\Country.csv', encoding="latin-1")
print('Input Data Values ===================================')
print(InputData)

# ProcessData must be assigned BEFORE it is renamed — the original listing
# called ProcessData.rename(...) two lines before defining ProcessData,
# which raises NameError.
ProcessData = InputData
ProcessData.rename(columns={'Year': 'year'}, inplace=True)
ProcessData.rename(columns={'ccTLD': 'cctld'}, inplace=True)

# Persist the processed frame in HORUS (CSV) format.
ProcessData.to_csv(r"E:\country2.csv")
- ----------------------------------------------------------------------------
- B. XML to HORUS Format
- Code:
- # Utility Start XML to HORUS =================================
- # Standard Tools
- import pandas as pd
- import xml.etree.ElementTree as ET
def df2xml(data):
    """Serialize a pandas DataFrame to an XML byte string.

    Each row becomes an <entry> element under <root>; each column becomes a
    child element named after the column, holding the cell value as text, or
    'n/a' for missing (NaN) values.
    """
    header = data.columns
    root = ET.Element('root')
    for row in range(data.shape[0]):
        entry = ET.SubElement(root, 'entry')
        for index in range(data.shape[1]):
            schild = str(header[index])
            # SubElement both CREATES the node and ATTACHES it to `entry`.
            # The original listing additionally called entry.append(child),
            # which inserted every field element twice.
            child = ET.SubElement(entry, schild)
            if str(data[schild][row]) != 'nan':
                child.text = str(data[schild][row])
            else:
                child.text = 'n/a'
    result = ET.tostring(root)
    return result
def xml2df(xml_data):
    """Parse an XML document of <entry> records into a pandas DataFrame.

    Each direct child of the root becomes one row; each of its sub-elements
    becomes a (tag -> text) column value.
    """
    root = ET.XML(xml_data)
    records = [
        {field.tag: field.text for field in entry}
        for entry in root
    ]
    return pd.DataFrame(records)
# --- XML to HORUS driver -------------------------------------------------
# Reads the country-code XML, reshapes it into the HORUS (CSV) layout and
# writes the result.  Requires xml2df() defined above.
sInputFileName = 'C:/VKHCG/05-DS/9999-Data/Country_Code.xml'
# Context manager closes the file handle; the original leaked the handle
# returned by open().
with open(sInputFileName) as fInput:
    InputData = fInput.read()
print('=====================================================')
print('Input Data Values ===================================')
print('=====================================================')
print(InputData)
print('=====================================================')
ProcessDataXML = InputData
# XML to Data Frame
ProcessData = xml2df(ProcessDataXML)
# Remove columns ISO-2-Code and ISO-3-CODE
ProcessData.drop('ISO-2-CODE', axis=1, inplace=True)
ProcessData.drop('ISO-3-Code', axis=1, inplace=True)
# Rename Country and ISO-M49
ProcessData.rename(columns={'Country': 'CountryName'}, inplace=True)
ProcessData.rename(columns={'ISO-M49': 'CountryNumber'}, inplace=True)
# Set new Index
ProcessData.set_index('CountryNumber', inplace=True)
# Sort by CountryName, descending.  (The original comment said
# "CurrencyNumber", but the code sorts on CountryName.)
ProcessData.sort_values('CountryName', axis=0, ascending=False, inplace=True)
print('=====================================================')
print('Process Data Values =================================')
print('=====================================================')
print(ProcessData)
print('=====================================================')
OutputData = ProcessData
sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-XML-Country.csv'
# NOTE(review): index=False drops the CountryNumber index from the output —
# confirm that is the intended HORUS layout.
OutputData.to_csv(sOutputFileName, index=False)
print('=====================================================')
print('XML to HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- C. JSON to HORUS Format
- Code:
- # Utility Start JSON to HORUS =================================
- # Standard Tools
- #=============================================================
# Utility Start JSON to HORUS =================================
# Standard Tools
#=============================================================
import pandas as pd

# Input Agreement ============================================
sInputFileName = 'C:/VKHCG/05-DS/9999-Data/Country_Code.json'
InputData = pd.read_json(sInputFileName, orient='index', encoding="latin-1")
print('Input Data Values ===================================')
print(InputData)
print('=====================================================')

# Processing Rules ===========================================
ProcessData = InputData
# Drop both ISO code columns in one call.
ProcessData.drop(['ISO-2-CODE', 'ISO-3-Code'], axis=1, inplace=True)
# Rename both remaining columns in one call.
ProcessData.rename(
    columns={'Country': 'CountryName', 'ISO-M49': 'CountryNumber'},
    inplace=True,
)
# Re-index on the country number and sort by name, descending.
ProcessData.set_index('CountryNumber', inplace=True)
ProcessData.sort_values('CountryName', axis=0, ascending=False, inplace=True)
print('Process Data Values =================================')
print(ProcessData)
print('=====================================================')

# Output Agreement ===========================================
OutputData = ProcessData
sOutputFileName = 'c:/VKHCG/05-DS/9999-Data/HORUS-JSON-Country.csv'
OutputData.to_csv(sOutputFileName, index=False)
print('JSON to HORUS - Done')
- -------------------------------------------------------------------
- D. MySql Database to HORUS Format
- Code:
# D. SQLite database table to HORUS format.
import pandas as pd
import sqlite3 as sq

sInputFileName = 'C:/VKHCG/05-DS/9999-Data/utility.db'
sInputTable = 'Country_Code'
# NOTE: the table name is interpolated into the SQL text; that is acceptable
# only because sInputTable is a hard-coded constant, never user input.
sSQL = 'select * FROM ' + sInputTable + ';'
conn = sq.connect(sInputFileName)
try:
    InputData = pd.read_sql_query(sSQL, conn)
finally:
    conn.close()  # the original listing never closed the connection
print('Input Data Values ===================================')
print(InputData)
print('=====================================================')
ProcessData = InputData
ProcessData.drop('ISO-2-CODE', axis=1, inplace=True)
ProcessData.drop('ISO-3-Code', axis=1, inplace=True)
ProcessData.rename(columns={'Country': 'CountryName'}, inplace=True)
ProcessData.rename(columns={'ISO-M49': 'CountryNumber'}, inplace=True)
ProcessData.set_index('CountryNumber', inplace=True)
ProcessData.sort_values('CountryName', axis=0, ascending=False, inplace=True)
print('Process Data Values =================================')
print(ProcessData)
print('=====================================================')
OutputData = ProcessData
sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-CSV-Country.csv'
OutputData.to_csv(sOutputFileName, index=False)
print('Database to HORUS - Done')
- -------------------------------------------------------------------
- E. Picture (JPEG) to HORUS Format (Use SPYDER to run this program)
- Code:
# E. Picture (JPEG) to HORUS format.
# scipy.misc.imread was removed in SciPy 1.2; read the image with matplotlib
# (already imported by this listing) and synthesize an opaque alpha plane with
# NumPy so the rest of the flow still sees RGBA data.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

sInputFileName = 'C:/VKHCG/05-DS/9999-Data/Angus.jpg'
InputRGB = plt.imread(sInputFileName)  # JPEG decodes to (rows, cols, 3) uint8
alpha = np.full(InputRGB.shape[:2] + (1,), 255, dtype=InputRGB.dtype)
InputData = np.concatenate([InputRGB, alpha], axis=2)
print('Input Data Values ===================================')
print('X: ', InputData.shape[0])
print('Y: ', InputData.shape[1])
print('RGBA: ', InputData.shape[2])
print('=====================================================')
# Flatten the pixels and regroup them six values per row.
# NOTE(review): the XAxis/YAxis labels do not hold real coordinates — the
# reshape simply chops the flattened RGBA stream into 6-wide rows, exactly as
# in the original listing.  Confirm this is the intended HORUS layout.
ProcessRawData = InputData.flatten()
y = InputData.shape[2] + 2
x = int(ProcessRawData.shape[0] / y)
ProcessData = pd.DataFrame(np.reshape(ProcessRawData, (x, y)))
ProcessData.columns = ['XAxis', 'YAxis', 'Red', 'Green', 'Blue', 'Alpha']
ProcessData.index.names = ['ID']
print('Rows: ', ProcessData.shape[0])
print('Columns :', ProcessData.shape[1])
print('=====================================================')
print('Process Data Values =================================')
print('=====================================================')
plt.imshow(InputData)
plt.show()
print('=====================================================')
OutputData = ProcessData
print('Storing File')
sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-Picture.csv'
OutputData.to_csv(sOutputFileName, index=False)
print('=====================================================')
print('Picture to HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- F. Video to HORUS Format
- Code:
- Movie to Frames
# F. Movie to frames: dump every video frame to a numbered JPEG.
import os
import shutil
import cv2

sInputFileName = 'C:/VKHCG/05-DS/9999-Data/dog.mp4'
sDataBaseDir = 'C:/VKHCG/05-DS/9999-Data/temp'
# Start from an empty frame directory.
if os.path.exists(sDataBaseDir):
    shutil.rmtree(sDataBaseDir)
os.makedirs(sDataBaseDir)
print('=====================================================')
print('Start Movie to Frames')
print('=====================================================')
vidcap = cv2.VideoCapture(sInputFileName)
count = 0
while True:
    success, image = vidcap.read()
    if not success:
        # End of stream.  The original loop read once BEFORE the loop and
        # again at the top of the body, silently discarding frame 0; it also
        # passed image=None to imwrite on the final failed read.
        break
    sFrame = sDataBaseDir + '/dog-frame-' + format(count, '04d') + '.jpg'
    print('Extracted: ', sFrame)
    cv2.imwrite(sFrame, image)
    if os.path.getsize(sFrame) == 0:
        # Empty file: remove it and reuse this frame number.
        os.remove(sFrame)
        print('Removed: ', sFrame)
        continue
    if cv2.waitKey(10) == 27:  # exit if Escape is hit
        break
    count += 1
vidcap.release()
print('=====================================================')
print('Generated : ', count, ' Frames')
print('=====================================================')
print('Movie to Frames HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- Frames to Horus (Use SPYDER to run this program)
# Frames to HORUS.
# scipy.misc.imread was removed in SciPy 1.2 — matplotlib (already imported
# here) reads the JPEGs and NumPy appends the alpha plane.  DataFrame.append
# was removed in pandas 2.0, so the per-frame tables are collected in a list
# and concatenated once at the end.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Input Agreement ============================================
sDataBaseDir = 'C:/VKHCG/05-DS/9999-Data/temp'
FrameTables = []
for file in os.listdir(sDataBaseDir):
    if not file.endswith(".jpg"):
        continue
    sInputFileName = os.path.join(sDataBaseDir, file)
    print('Process : ', sInputFileName)
    InputRGB = plt.imread(sInputFileName)  # (rows, cols, 3) uint8
    alpha = np.full(InputRGB.shape[:2] + (1,), 255, dtype=InputRGB.dtype)
    InputData = np.concatenate([InputRGB, alpha], axis=2)
    print('Input Data Values ===================================')
    print('X: ', InputData.shape[0])
    print('Y: ', InputData.shape[1])
    print('RGBA: ', InputData.shape[2])
    print('=====================================================')
    # Processing Rules ===========================================
    ProcessRawData = InputData.flatten()
    y = InputData.shape[2] + 2
    x = int(ProcessRawData.shape[0] / y)
    ProcessFrameData = pd.DataFrame(np.reshape(ProcessRawData, (x, y)))
    ProcessFrameData['Frame'] = file  # tag every row with its source frame
    print('=====================================================')
    print('Process Data Values =================================')
    print('=====================================================')
    plt.imshow(InputData)
    plt.show()
    FrameTables.append(ProcessFrameData)

f = len(FrameTables)
if f > 0:
    ProcessData = pd.concat(FrameTables)
    sColumns = ['XAxis', 'YAxis', 'Red', 'Green', 'Blue', 'Alpha', 'FrameName']
    ProcessData.columns = sColumns
    print('=====================================================')
    # Fixed: the original set index.names on ProcessFrameData (the last
    # per-frame table) instead of the combined ProcessData.
    ProcessData.index.names = ['ID']
    print('Rows: ', ProcessData.shape[0])
    print('Columns :', ProcessData.shape[1])
    print('=====================================================')
    # Output Agreement ===========================================
    OutputData = ProcessData
    print('Storing File')
    sOutputFileName = 'C:/VKHCG/05-DS/9999-Data/HORUS-Movie-Frame.csv'
    OutputData.to_csv(sOutputFileName, index=False)
    print('=====================================================')
print('Processed ; ', f, ' frames')
print('=====================================================')
print('Movie to HORUS - Done')
print('=====================================================')
- -------------------------------------------------------------------
- G. Audio to HORUS Format
- Code:
- from scipy.io import wavfile
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
- #=============================================================
def show_info(aname, a, r):
    """Print basic diagnostics for an audio signal, then plot it.

    aname: human-readable label; a: sample array; r: sample rate in Hz.
    """
    divider = '----------------'
    print(divider)
    print("Audio:", aname)
    print(divider)
    print("Rate:", r)
    print(divider)
    print("shape:", a.shape)
    print("dtype:", a.dtype)
    print("min, max:", a.min(), a.max())
    print(divider)
    plot_info(aname, a, r)
- #=============================================================
def plot_info(aname, a, r):
    """Plot every channel of signal `a` (columns) on one set of axes."""
    plt.title('Signal Wave - ' + aname + ' at ' + str(r) + 'hz')
    legend_labels = []
    for ch in range(a.shape[1]):
        plt.plot(a[:, ch], label='Ch' + str(ch + 1))
        # NOTE(review): legend entries are bare channel numbers while the line
        # labels are 'ChN' — preserved from the original; confirm intended.
        legend_labels.append(str(ch + 1))
    plt.legend(legend_labels)
    plt.show()
- #=============================================================
# G. Audio to HORUS: convert each multi-channel WAV into a HORUS CSV.
# The original listing repeated the same seven statements four times (for 2,
# 4, 6 and 8 channels); drive the identical flow from a single loop instead.
sDataDir = 'C:/VKHCG/05-DS/9999-Data'
for nCh in (2, 4, 6, 8):
    sInputFileName = sDataDir + '/' + str(nCh) + 'ch-sound.wav'
    print('=====================================================')
    print('Processing : ', sInputFileName)
    print('=====================================================')
    InputRate, InputData = wavfile.read(sInputFileName)
    show_info(str(nCh) + " channel", InputData, InputRate)
    ProcessData = pd.DataFrame(InputData)
    # Label columns Ch1..ChN to match the channel count.
    ProcessData.columns = ['Ch' + str(c + 1) for c in range(nCh)]
    OutputData = ProcessData
    sOutputFileName = sDataDir + '/HORUS-Audio-' + str(nCh) + 'ch.csv'
    OutputData.to_csv(sOutputFileName, index=False)
#=============================================================
print('=====================================================')
print('Audio to HORUS - Done')
- ------------------------------------------------------------------
- Practical 3: Utilities and Auditing
- A. Fixers Utilities:
- Fixers enable your solution to take your existing data and fix a specific quality issue.
# A. Fixers: small data-quality repairs.
import string
import datetime as dt

# 1 Removing leading or lagging spaces from a data entry
print('#1 Removing leading or lagging spaces from a data entry')
baddata = " Data Science with too many spaces is bad!!! "
print('>', baddata, '<')
cleandata = baddata.strip()
print('>', cleandata, '<')

# *******************************************************
# 2 Removing nonprintable characters from a data entry
print('#2 Removing nonprintable characters from a data entry')
# Membership tests against a set are O(1); the original listing built this
# set but then filtered against the raw string.printable string instead.
printable = set(string.printable)
baddata = "Data\x00Science with\x02 funny characters is \x10bad!!!"
cleandata = ''.join(filter(lambda x: x in printable, baddata))
print('Bad Data : ', baddata)
print('Clean Data : ', cleandata)

# ***************************************************************
# 3 Reformatting data entry to match specific formatting criteria.
# Convert YYYY-MM-DD to "DD Month YYYY".  (The original comment said
# YYYY/MM/DD, but the code formats with dashes.)
print('# 3 Reformatting data entry to match specific formatting criteria.')
baddate = dt.date(2019, 10, 31)
baddata = format(baddate, '%Y-%m-%d')
gooddate = dt.datetime.strptime(baddata, '%Y-%m-%d')
gooddata = format(gooddate, '%d %B %Y')
print('Bad Data : ', baddata)
print('Good Data : ', gooddata)
- -------------------------------------------------------------------
- C. Averaging of Data
import pandas as pd

# C. Averaging of data: mean latitude per (country, place) pair.
sFileName = r"C:\Users\admin\Desktop\MSC(IT)-part1\IP_DATA_ALL.csv"
print('Loading :', sFileName)
IP_DATA_ALL = pd.read_csv(
    sFileName,
    header=0,
    low_memory=False,
    usecols=['Country', 'Place Name', 'Latitude', 'Longitude'],
    encoding="latin-1",
)
# Make the place column a valid identifier-like name.
IP_DATA_ALL.rename(columns={'Place Name': 'Place_Name'}, inplace=True)
AllData = IP_DATA_ALL[['Country', 'Place_Name', 'Latitude']]
print(AllData)
MeanData = AllData.groupby(['Country', 'Place_Name'])['Latitude'].mean()
print(MeanData)
- ---------------------------------------------------------------
- B. Data Binning or Bucketing
import pandas as pd

# B. Data binning: bucket exam marks into three grades.
data = pd.read_csv(r"E:\Book1.csv")
bin_edges = [0, 45, 75, 100]  # intervals (0,45], (45,75], (75,100]
bin_labels = ["not_satisfy", "good", "Excellent"]
data1 = pd.cut(data['Marks'], bin_edges, labels=bin_labels)
data1
#note: make csv file book1 name,marks column
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Histogram of normally distributed sample data with fitted PDF overlay.
import numpy as np
import matplotlib.mlab as mlab  # NOTE(review): unused — mlab.normpdf was removed from matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats

np.random.seed(0)
# example data
mu = 90      # mean of distribution
sigma = 25   # standard deviation of distribution
x = mu + sigma * np.random.randn(5000)
num_bins = 25
fig, ax = plt.subplots()
# the histogram of the data (density expects a bool, not the literal 1)
n, bins, patches = ax.hist(x, num_bins, density=True)
# add a 'best fit' line: scipy.stats.norm.pdf replaces the removed
# mlab.normpdf(bins, mu, sigma)
y = stats.norm.pdf(bins, mu, sigma)
ax.plot(bins, y, '--')
ax.set_xlabel('Example Data')
ax.set_ylabel('Probability density')
# Raw strings: \mu and \sigma are TeX escapes, not Python escapes.  The
# original listing also wrapped this expression across lines without a
# continuation, which is a SyntaxError.
sTitle = (r'Histogram ' + str(len(x)) + ' entries into ' + str(num_bins)
          + r' Bins: $\mu=' + str(mu) + r'$, $\sigma=' + str(sigma) + '$')
ax.set_title(sTitle)
fig.tight_layout()
sPathFig = 'C:/VKHCG/05-DS/4000-UL/0200-DU/DU-Histogram.png'
fig.savefig(sPathFig)
plt.show()
- -------------------------------------------------------------------
- D. Outlier Detection
- C:\VKHCG\05-DS\4000-UL\0200-DU\DU-Outliers.py
- Code:
- ################################################################
- import pandas as pd
- sFileName = r"C:\practical-data-science-master\VKHCG\01-Vermeulen\00-RawData\data\IP_DATA_ALL.csv"
- IP_DATA_ALL=pd.read_csv(sFileName,header=0,low_memory=False,
- usecols=['Country','Place Name','Latitude','Longitude'], encoding="latin-1")
- LondonData=IP_DATA_ALL.loc[IP_DATA_ALL['Place Name']=='New York']
- AllData=LondonData[['Country', 'Place Name','Latitude']]
- print(AllData)
- MeanData=AllData.groupby(['Country', 'Place Name'])['Latitude'].mean()
- StdData=AllData.groupby(['Country', 'Place Name'])['Latitude'].std()
- print('Outliers')
- UpperBound=float(MeanData+StdData)
- print('Higher than ', UpperBound)
- OutliersHigher=AllData[AllData.Latitude>UpperBound]
- print(OutliersHigher)
- LowerBound=float(MeanData-StdData)
- print('Lower than ', LowerBound)
- OutliersLower=AllData[AllData.Latitude<LowerBound]
- print(OutliersLower)
- print('Not Outliers')
- OutliersNot=AllData[(AllData.Latitude>=LowerBound) & (AllData.Latitude<=UpperBound)]
- print(OutliersNot)
- -----------------------------------------
- ----Write a Python / R program for basic logging in data science.
import logging

# Root-logger configuration: DEBUG level, timestamped records, written to the
# given log file ('w' truncates the file on every run).
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    filename=r'C:\Users\admin\Desktop\MSC(IT)-part1\abc.log',
    filemode='w',
)
logger = logging.getLogger()
import math
import logging

# Module-level logger named after the module; records still propagate to the
# root handlers configured above.
logger = logging.getLogger(__name__)

def equation(a, b, c):
    """Return the two real roots of a*x**2 + b*x + c = 0.

    Assumes a != 0 and a non-negative discriminant; math.sqrt raises
    ValueError when the roots are complex.
    """
    logger.debug("compute disc")
    disc = b**2 - 4*a*c
    logger.debug("compute roots")
    # Quadratic formula: the WHOLE numerator is divided by 2a.  The original
    # computed -b + sqrt(disc)/2*a, i.e. -b + (sqrt(disc)/2)*a, which is
    # wrong for every input (e.g. it returned (1, -1) for x**2 - 4 = 0).
    root1 = (-b + math.sqrt(disc)) / (2*a)
    root2 = (-b - math.sqrt(disc)) / (2*a)
    logger.debug("return roots")
    return root1, root2

equation(1, 0, -4)
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Per-company / per-layer logging setup: creates a fresh Logging directory and
# a uniquely named log file for every (company, layer) pair, then emits one
# message at each level through an application-specific logger.
import sys
import os
import logging
import uuid
import shutil
import time

############################################################
Base = 'C:/VKHCG'
############################################################
sCompanies = ['01-Vermeulen', '02-Krennwallner', '03-Hillman', '04-Clark']
sLayers = ['01-Retrieve', '02-Assess', '03-Process', '04-Transform', '05-Organise', '06-Report']
sLevels = ['debug', 'info', 'warning', 'error']

for sCompany in sCompanies:
    sFileDir = Base + '/' + sCompany
    if not os.path.exists(sFileDir):
        os.makedirs(sFileDir)
    for sLayer in sLayers:
        log = logging.getLogger()  # root logger
        # Remove all old handlers so basicConfig reconfigures on every pass
        # (basicConfig is a no-op while the root logger still has handlers).
        for hdlr in log.handlers[:]:
            log.removeHandler(hdlr)
        #----------------------------------------------------------------------------------
        sFileDir = Base + '/' + sCompany + '/' + sLayer + '/Logging'
        if os.path.exists(sFileDir):
            shutil.rmtree(sFileDir)
        time.sleep(2)
        if not os.path.exists(sFileDir):
            os.makedirs(sFileDir)
        skey = str(uuid.uuid4())
        sLogFile = Base + '/' + sCompany + '/' + sLayer + '/Logging/Logging_' + skey + '.log'
        print('Set up:', sLogFile)
        # Set up logging to file.
        # FIX: the original listing had `datefmt=fmt='%m-%d%H:%M'`, which is a
        # SyntaxError; basicConfig takes a single `datefmt` keyword argument.
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                            datefmt='%m-%d%H:%M',
                            filename=sLogFile,
                            filemode='w')
        # Handler that mirrors INFO-and-above messages to sys.stderr.
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        # Simpler format for console use.
        formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
        console.setFormatter(formatter)
        logging.getLogger('').addHandler(console)
        # Log to the root logger first...
        logging.info('Practical Data Science is fun!.')
        for sLevel in sLevels:
            # NOTE(review): 'Apllication' is misspelled in the original logger
            # name; kept as-is in case the name is matched downstream.
            sApp = 'Apllication-' + sCompany + '-' + sLayer + '-' + sLevel
            logger = logging.getLogger(sApp)
            if sLevel == 'debug':
                logger.debug('Practical Data Science logged a debugging message.')
            if sLevel == 'info':
                logger.info('Practical Data Science logged information message.')
            if sLevel == 'warning':
                logger.warning('Practical Data Science logged a warning message.')
            if sLevel == 'error':
                logger.error('Practical Data Science logged an error message.')
- -------------------------------------------------------------
- Retrieve Superstep
- A) Aim: - Perform the following data processing using R.
- library(readr)
- Doctor_Details1 <- read_csv("Doctor_Details1.csv")
- View(Doctor_Details1)
- spec(Doctor_Details1)
- > library(readr)
- > Doctor_Details1 <- read_csv("Doctor_Details1.csv")
- Parsed with column specification:
- cols(
- Doc_Id = col_double(),
- Doc_Name = col_character(),
- Doc_Dep = col_character(),
- Start_Time = col_time(format = ""),
- End_Time = col_time(format = ""),
- Doc_Salary = col_double(),
- No_of_patients = col_double()
- )
- > view(Doctor_Details1)
- > spec(Doctor_Details1)
- cols(
- Doc_Id = col_double(),
- Doc_Name = col_character(),
- Doc_Dep = col_character(),
- Start_Time = col_time(format = ""),
- End_Time = col_time(format = ""),
- Doc_Salary = col_double(),
- No_of_patients = col_double()
- )
- library(tibble)
- set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- > library(tibble)
- > set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- # A tibble: 79 x 7
- Doc_Id Doc_Name Doc_Dep Start_Time End_Time Doc_Salary
- <dbl> <chr> <chr> <time> <time> <dbl>
- 1 1 Akhil Dots 07:51 03:34 82148
- 2 2 Nikil Eye 07:29 02:13 72824
- 3 3 Sharma Eye 07:58 02:31 89707
- 4 4 Dev E&T 07:26 05:51 59634
- 5 5 Das Dots 07:31 05:00 68756
- 6 6 Shilpa E&T 06:04 01:33 79237
- 7 7 Shetty Dots 07:36 04:23 85601
- 8 8 Darshit General 08:29 04:56 66319
- 9 9 Abhishek Eye 08:00 02:38 61841
- 10 10 Shrikant General 06:30 02:39 54774
- # ... with 69 more rows, and 1 more variable:
- # No_of_patients <dbl>
- Doctor_Details1_Fix = set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- sapply(Doctor_Details1_Fix,typeof)
- > Doctor_Details1_Fix = set_tidy_names(Doctor_Details1, syntactic = TRUE, quiet = FALSE)
- > sapply(Doctor_Details1_Fix,typeof)
- Doc_Id Doc_Name Doc_Dep
- "double" "character" "character"
- Start_Time End_Time Doc_Salary
- "double" "double" "double"
- No_of_patients
- "double"
- library(data.table)
- hist_DocName=data.table(DocName=unique(DocDetails_Fix[is.na(DocDetails_Fix['DocName'])==0,]$DocName))
- view(hist_DocName)
- > view(hist_DocName)
- sapply(DocDetails['DocSalary'],mean,na.rm=TRUE)
- > sapply(DocDetails['DocSalary'],mean,na.rm=TRUE)
- DocSalary
- 71990.71
- -----------------------------------------------------------------
- B) Aim: - Program to retrieve different attributes of data.
- Code: -
# B. Retrieve the attributes (column names/types) of a data set and write a
# cleaned copy with a RowID index.
import sys
import os
import pandas as pd

# Raw string so the backslashes in the Windows path are not treated as escape
# sequences (the plain string in the original triggers invalid-escape
# warnings on modern Python).
sFileName = r'C:\Ab\VKHCG\Doctor_Details1.csv'
IP_DATA_ALL = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
print('Rows:', IP_DATA_ALL.shape[0])
print('Columns:', IP_DATA_ALL.shape[1])
print('### Raw Data Set #####################################')
for i in range(0, len(IP_DATA_ALL.columns)):
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))
print('### Fixed Data Set ###################################')
# NOTE: this is a reference to the SAME frame, not a copy — the renaming loop
# below also changes IP_DATA_ALL's column labels.
IP_DATA_ALL_FIX = IP_DATA_ALL
for i in range(0, len(IP_DATA_ALL.columns)):
    # Strip surrounding whitespace, then turn embedded spaces into dots so
    # every column label is a clean, space-free name.
    cNameOld = IP_DATA_ALL_FIX.columns[i] + ' '
    cNameNew = cNameOld.strip().replace(" ", ".")
    IP_DATA_ALL_FIX.columns.values[i] = cNameNew
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))
print('Fixed Data Set with ID')
IP_DATA_ALL_with_ID = IP_DATA_ALL_FIX
IP_DATA_ALL_with_ID.index.names = ['RowID']
#print(IP_DATA_ALL_with_ID.head())
sFileName2 = sFileDir + '/Retrieve_IP_DATA.csv'
IP_DATA_ALL_with_ID.to_csv(sFileName2, index=True, encoding="latin-1")
print('### Done!! ############################################')
- -------------------------------------------------------------------
- C) Aim: - Data Pattern.
- Code: -
- library(readr)
- library(data.table)
- # Build the input path and load the doctor-details CSV.
- FileName=paste0('C:/Ab/VKHCG/DocDetails.csv')
- IP_DATA_ALL <- read_csv(FileName)
- # Distinct doctor names.
- hist_DocName=data.table(DocName=unique(IP_DATA_ALL$DocName))
- # Pair each name with a copy that will be rewritten as a pattern string.
- pattern_DocName=data.table(DocName=hist_DocName$DocName,PatternDocName=hist_DocName$DocName)
- # Every letter (upper or lower case) is mapped to 'A'.
- oldchar=c(letters,LETTERS)
- newchar=replicate(length(oldchar),"A")
- for (r in seq(nrow(pattern_DocName))){
- s=pattern_DocName[r,]$PatternDocName;
- for (c in seq(length(oldchar))){
- s=chartr(oldchar[c],newchar[c],s)
- };
- # Every digit is mapped to 'N'.
- for (n in seq(0,9,1)){
- s=chartr(as.character(n),"N",s)
- };
- # Spaces map to 'b' and dots map to 'u'.
- s=chartr(" ","b",s)
- s=chartr(".","u",s)
- pattern_DocName[r,]$PatternDocName=s;
- };
- View(pattern_DocName)
- --------------------------------------------------------------
- D) Aim: - Loading IP_DATA_ALL.
- Code: -
# D. Loading IP_DATA_ALL: read the CSV, normalise the column labels, and
# write a copy keyed by a RowID index.
import sys
import os
import pandas as pd

sFileName = 'C:\Ab\VKHCG\Doctor_Details1.csv'
IP_DATA_ALL = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)

# Profile of the raw frame.
print('Rows:', IP_DATA_ALL.shape[0])
print('Columns:', IP_DATA_ALL.shape[1])
print('### Raw Data Set #####################################')
for i in range(0, len(IP_DATA_ALL.columns)):
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))

print('### Fixed Data Set ###################################')
IP_DATA_ALL_FIX = IP_DATA_ALL
for i in range(0, len(IP_DATA_ALL.columns)):
    # Trim whitespace and replace embedded spaces with dots.
    cNameNew = (IP_DATA_ALL_FIX.columns[i] + ' ').strip().replace(" ", ".")
    IP_DATA_ALL_FIX.columns.values[i] = cNameNew
    print(IP_DATA_ALL.columns[i], type(IP_DATA_ALL.columns[i]))

print('Fixed Data Set with ID')
IP_DATA_ALL_with_ID = IP_DATA_ALL_FIX
IP_DATA_ALL_with_ID.index.names = ['RowID']
#print(IP_DATA_ALL_with_ID.head())
sFileName2 = sFileDir + '/Retrieve_IP_DATA.csv'
IP_DATA_ALL_with_ID.to_csv(sFileName2, index=True, encoding="latin-1")
print('### Done!! ############################################')
- --------------------------------------------------------------------------------------------
- A) Aim: - Perform error management on the given data using pandas package
- i. Drop the Columns Where All Elements Are Missing Values
- Code: -
# i. Drop the columns where ALL elements are missing values.
import sys
import os
import pandas as pd

sInputFileName = 'Good-or-Bad.csv'
sOutputFileName = 'Good-or-Bad-01.csv'
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)

# Load the raw file and show its profile.
RawData = pd.read_csv('C:/Ab/VKHCG/' + sInputFileName, header=0)
print('## Raw Data Values')
print(RawData)
print('## Data Profile')
print('Rows :', RawData.shape[0])
print('Columns :', RawData.shape[1])
# Re-save the raw copy alongside the output.
RawData.to_csv(sFileDir + '/' + sInputFileName, index=False)

# Keep only columns that have at least one non-missing value.
TestData = RawData.dropna(axis=1, how='all')
print('## Test Data Values')
print(TestData)
print('## Data Profile')
print('Rows :', TestData.shape[0])
print('Columns :', TestData.shape[1])
TestData.to_csv(sFileDir + '/' + sOutputFileName, index=False)
print('### Done!! #####################')
- --------------------------------------------------------------
- ii. Drop the Columns Where Any of the Elements Is Missing Values
- Code: -
import sys
import os
import pandas as pd

# Error management ii: drop every column that contains ANY missing value.
sInputFileName = 'Good-or-Bad.csv'
# fix: the original wrote to 'Good-or-Bad-01.csv', silently clobbering
# step i's output; each step should keep its own result file.
sOutputFileName = 'Good-or-Bad-02.csv'
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = 'C:/Ab/VKHCG/' + sInputFileName
RawData = pd.read_csv(sFileName, header=0)
print('## Raw Data Values')
print(RawData)
print('## Data Profile')
print('Rows :', RawData.shape[0])
print('Columns :', RawData.shape[1])
# re-write the raw input alongside the cleaned output
sFileName = sFileDir + '/' + sInputFileName
RawData.to_csv(sFileName, index=False)
# how='any' discards a column as soon as one value is missing
TestData = RawData.dropna(axis=1, how='any')
print('## Test Data Values')
print(TestData)
print('## Data Profile')
print('Rows :', TestData.shape[0])
print('Columns :', TestData.shape[1])
sFileName = sFileDir + '/' + sOutputFileName
TestData.to_csv(sFileName, index=False)
print('### Done!! #####################')
- --------------------------------------------------------------
- iii. Keep Only the Rows That Contain a Maximum of Two Missing Values
- Code: -
import sys
import os
import pandas as pd

# Error management iii: keep only rows with at most two missing values.
sInputFileName = 'Good-or-Bad.csv'
# fix: the original wrote to 'Good-or-Bad-01.csv', clobbering step i's
# output; step iii gets its own result file.
sOutputFileName = 'Good-or-Bad-03.csv'
sFileDir = 'C:/Ab/VKHCG/'
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = 'C:/Ab/VKHCG/' + sInputFileName
RawData = pd.read_csv(sFileName, header=0)
print('## Raw Data Values')
print(RawData)
print('## Data Profile')
print('Rows :', RawData.shape[0])
print('Columns :', RawData.shape[1])
# re-write the raw input alongside the cleaned output
sFileName = sFileDir + '/' + sInputFileName
RawData.to_csv(sFileName, index=False)
# fix: dropna(thresh=k) keeps rows with at least k NON-missing values.
# The original thresh=2 kept any row with two or more real values,
# which is not "at most two missing".  Allowing at most two missing
# values means a row needs at least (columns - 2) real values.
TestData = RawData.dropna(thresh=RawData.shape[1] - 2)
print('## Test Data Values')
print(TestData)
print('## Data Profile')
print('Rows :', TestData.shape[0])
print('Columns :', TestData.shape[1])
sFileName = sFileDir + '/' + sOutputFileName
TestData.to_csv(sFileName, index=False)
print('### Done!! #####################')
- ---------------------------------------------------------------
- B) Aim: - Write Python program to create the network routing diagram from the given data.
- Code: -
import sys
import os
import pandas as pd

# Build DocDetailsST.csv: rename/trim the salary and time feeds, join
# them on DocId and prefix every merged column with 'Doc_'.
pd.options.mode.chained_assignment = None
Base = 'F:/VSIT/MscIT/Part-1/SEM-I/DS/Practs'
sInputFileName1 = '/DocSalDetails.csv'
sInputFileName2 = '/DocTimeDetails.csv'
sInputFileName3 = '/DocDetails.csv'
sOutputFileName = '/DocDetailsST.csv'

sFileName = Base + sInputFileName1
print('Loading :', sFileName)
SalData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded Salary Data:', SalData.columns.values)
print('################################')
print('Changed :', SalData.columns.values)
# one rename call instead of two; DocDep is not needed downstream
SalData.rename(columns={'DocName': 'Doc_Name', 'DocSalary': 'Doc_Salary'}, inplace=True)
SalData.drop('DocDep', axis=1, inplace=True)
print('To :', SalData.columns.values)
print('################################')

sFileName = Base + sInputFileName2
print('Loading :', sFileName)
TimeData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded Time Data:', TimeData.columns.values)
print('################################')
print('Changed :', TimeData.columns.values)
TimeData.rename(columns={'StartTime': 'Start_Time', 'EndTime': 'End_Time'}, inplace=True)
print('To :', TimeData.columns.values)
print('################################')

sFileName = Base + sInputFileName3
print('Loading :', sFileName)
# NOTE(review): DocRawData is never used below — kept only to preserve
# the original behaviour (it fails fast if the file is missing);
# confirm whether this load can be removed.
DocRawData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")

DocNetworkData = pd.merge(
    SalData,
    TimeData,
    how='inner',
    on='DocId'
)
################################################################
print('################################')
print('Change ', DocNetworkData.columns.values)
# prefix every merged column in one rename call
DocNetworkData.rename(columns={c: 'Doc_' + c for c in DocNetworkData.columns.values},
                      inplace=True)
print('To ', DocNetworkData.columns.values)
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
# fix: sOutputFileName already begins with '/', so the extra '/' the
# original inserted produced 'Practs//DocDetailsST.csv'.
sFileName = sFileDir + sOutputFileName
print('Storing :', sFileName)
DocNetworkData.to_csv(sFileName, index=False, encoding="latin-1")
print('### Done!! #####################')
- -------------------------- CONTINUED --------------------------
- Code: -
import sys
import os
import pandas as pd

# Build DocDetailsNode.csv: keep only the descriptive doctor columns
# and prefix them with 'Node_'.
pd.options.mode.chained_assignment = None
Base = 'F:/VSIT/MscIT/Part-1/SEM-I/DS/Practs'
sInputFileName = '/DocDetails.csv'
sOutputFileName = '/DocDetailsNode.csv'
sFileName = Base + sInputFileName
print('Loading :', sFileName)
IPData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded IP :', IPData.columns.values)
print('################################')
print('Changed :', IPData.columns.values)
# drop the key/visit columns that do not belong on a node record
IPData.drop(['DocId', 'StartTime', 'EndTime', 'NoOfPatients'], axis=1, inplace=True)
IPData.rename(columns={'DocName': 'Doc_Name',
                       'DocDept': 'Doc_Dept',
                       'DocSalary': 'Doc_Salary'}, inplace=True)
print('To :', IPData.columns.values)
print('################################')
print('Change ', IPData.columns.values)
# prefix every remaining column in one rename call
IPData.rename(columns={c: 'Node_' + c for c in IPData.columns.values}, inplace=True)
print('To ', IPData.columns.values)
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
# fix: sOutputFileName already begins with '/', so the extra '/' the
# original inserted produced 'Practs//DocDetailsNode.csv'.
sFileName = sFileDir + sOutputFileName
print('################################')
print('Storing :', sFileName)
print('################################')
IPData.to_csv(sFileName, index=False, encoding="latin-1")
- -------------------------------------------------------------------
- C) Aim: - Write a Python / R program to build directed acyclic graph
- Code: -
import networkx as nx
import matplotlib.pyplot as plt
import sys
import os
import pandas as pd

# Build two directed graphs from the doctor departments and render each
# to a PNG.
# NOTE(review): edges are added between EVERY ordered pair of nodes, so
# both digraphs are complete and cyclic — not true DAGs despite the
# aim; confirm the intended edge rule.
Base = 'F:/VSIT/MscIT/Part-1/SEM-I/DS/Practs'
sInputFileName = '/DocDetails.csv'
sOutputFileName1 = 'DocDAG-1.png'
sOutputFileName2 = 'DocDAG-2.png'
sFileName = Base + sInputFileName
print('Loading :', sFileName)
print('################################')
DocData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print('Loaded Doctor Data :', DocData.columns.values)
print('################################')
print(DocData)
print('################################')
print('Rows : ', DocData.shape[0])
print('################################')
G1 = nx.DiGraph()
G2 = nx.DiGraph()
for i in range(DocData.shape[0]):
    G1.add_node(DocData['DocDep'][i])
    # NOTE(review): this concatenates the department with itself
    # ('X-X'); one operand was probably meant to be another column —
    # confirm.  Kept to preserve behaviour.
    sDepName = DocData['DocDep'][i] + '-' + DocData['DocDep'][i]
    G2.add_node(sDepName)
print('################################')
# fully connect G1 (both directions between every distinct pair)
for n1 in G1.nodes():
    for n2 in G1.nodes():
        if n1 != n2:
            print('Link :', n1, ' to ', n2)
            G1.add_edge(n1, n2)
print("Nodes of graph: ")
print(G1.nodes())
print("Edges of graph: ")
print(G1.edges())
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = sFileDir + '/' + sOutputFileName1
print('Storing :', sFileName)
# fix: the networkx keyword is node_color; the original's 'nodecolor'
# was not a valid argument and the colour request was dropped.
nx.draw(G1, pos=nx.spectral_layout(G1),
        node_color='r', edge_color='g',
        with_labels=True, node_size=8000,
        font_size=12)
plt.savefig(sFileName)  # save as png
plt.show()  # display
print('################################')
# fully connect G2 the same way
for n1 in G2.nodes():
    for n2 in G2.nodes():
        if n1 != n2:
            print('Link :', n1, ' to ', n2)
            G2.add_edge(n1, n2)
print("Nodes of graph: ")
print(G2.nodes())
print("Edges of graph: ")
print(G2.edges())
print('################################')
sFileDir = Base
if not os.path.exists(sFileDir):
    os.makedirs(sFileDir)
sFileName = sFileDir + '/' + sOutputFileName2
print('Storing :', sFileName)
print('################################')
nx.draw(G2, pos=nx.spectral_layout(G2),
        node_color='r', edge_color='b',
        with_labels=True, node_size=8000,
        font_size=12)
plt.savefig(sFileName)
plt.show()
- -----------------------------------------------------------------
- Process Superstep
- Aim: - Process Location
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq
from pandas.io import sql
import uuid

# Process superstep: generate a 10-degree longitude/latitude grid of
# location records and store them as the Process-Location table.
Base = 'C:/VKHCG'
print('Working Base :', Base, ' using ', sys.platform)
Company = '01-Vermeulen'
InputAssessGraphName = 'Assess_All_Animals.gml'
EDSAssessDir = '02-Assess/01-EDS'
InputAssessDir = EDSAssessDir + '/02-Python'
sFileAssessDir = Base + '/' + Company + '/' + InputAssessDir
if not os.path.exists(sFileAssessDir):
    os.makedirs(sFileAssessDir)
sDataBaseDir = Base + '/' + Company + '/03-Process/SQLite'
if not os.path.exists(sDataBaseDir):
    os.makedirs(sDataBaseDir)
sDatabaseName = sDataBaseDir + '/Vermeulen.db'
conn1 = sq.connect(sDatabaseName)
sDataVaultDir = Base + '/88-DV'
# fix: the original re-tested sDataBaseDir here, so the data-vault
# directory was never created before connecting to it.
if not os.path.exists(sDataVaultDir):
    os.makedirs(sDataVaultDir)
sDatabaseName = sDataVaultDir + '/datavault.db'
conn2 = sq.connect(sDatabaseName)
t = 0
# fix: the loops step by 10 degrees, so there are (360/10)*(180/10)
# cells; the original tMax of 360*180 over-reported the total 100-fold.
tMax = (360 // 10) * (180 // 10)
LocationRows = []
for Longitude in range(-180, 180, 10):
    for Latitude in range(-90, 90, 10):
        t += 1
        IDNumber = str(uuid.uuid4())
        # fix: the second half of the name now uses Latitude; the
        # original repeated Longitude twice.
        LocationName = 'L' + format(round(Longitude, 3) * 1000, '+07d') + \
                       '-' + format(round(Latitude, 3) * 1000, '+07d')
        print('Create:', t, ' of ', tMax, ':', LocationName)
        # pd.DataFrame.from_items was removed from pandas; a plain dict
        # preserves column order, and collecting rows in a list avoids
        # the removed (and quadratic) per-row DataFrame.append.
        LocationRows.append({'ObjectBaseKey': 'GPS',
                             'IDNumber': IDNumber,
                             'LocationNumber': str(t),
                             'LocationName': LocationName,
                             'Longitude': Longitude,
                             'Latitude': Latitude})
LocationFrame = pd.DataFrame(LocationRows,
                             columns=['ObjectBaseKey', 'IDNumber', 'LocationNumber',
                                      'LocationName', 'Longitude', 'Latitude'])
LocationHubIndex = LocationFrame.set_index(['IDNumber'], inplace=False)
sTable = 'Process-Location'
print('Storing :', sDatabaseName, ' Table:', sTable)
LocationHubIndex.to_sql(sTable, conn1, if_exists="replace")
sTable = 'Hub-Location'
print('Storing :', sDatabaseName, ' Table:', sTable)
#LocationHubIndex.to_sql(sTable, conn2, if_exists="replace")
print('Vacuum Databases')
sSQL = "VACUUM;"
sql.execute(sSQL, conn1)
#sql.execute(sSQL,conn2)
print('################')
print('### Done!! ############################################')
- --------------------------------------------------------------------------------------
- Aim: - Process Event
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq
from pandas.io import sql

# Process superstep: load the action plan CSV and store it as the
# Process-Event table keyed by EventID.
Base = 'C:/VKHCG'
print('Working Base :', Base, ' using ', sys.platform)
Company = '01-Vermeulen'
InputFileName = 'Action_Plan.csv'
sDataBaseDir = Base + '/' + Company + '/03-Process/SQLite'
if not os.path.exists(sDataBaseDir):
    os.makedirs(sDataBaseDir)
sDatabaseName = sDataBaseDir + '/Vermeulen.db'
conn1 = sq.connect(sDatabaseName)
sDataVaultDir = Base + '/88-DV'
# fix: the original re-tested sDataBaseDir here, so the data-vault
# directory was never created before connecting to it.
if not os.path.exists(sDataVaultDir):
    os.makedirs(sDataVaultDir)
sDatabaseName = sDataVaultDir + '/datavault.db'
conn2 = sq.connect(sDatabaseName)
sFileName = Base + '/' + Company + '/00-RawData/' + InputFileName
print('Loading :', sFileName)
EventRawData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
EventRawData.index.names = ['EventID']
EventHubIndex = EventRawData
sTable = 'Process-Event'
print('Storing :', sDatabaseName, ' Table:', sTable)
EventHubIndex.to_sql(sTable, conn1, if_exists="replace")
sTable = 'Hub-Event'
print('Storing :', sDatabaseName, ' Table:', sTable)
#EventHubIndex.to_sql(sTable, conn2, if_exists="replace")
print('################')
print('Vacuum Databases')
sSQL = "VACUUM;"
sql.execute(sSQL, conn1)
#sql.execute(sSQL,conn2)
print('### Done!! ############################################')
- -----------------------------------------------------------------------------------------
- Aim: - Sun model
- Code: -
import sys
import os
from datetime import datetime
from pytz import timezone
import pandas as pd
import sqlite3 as sq
import uuid

# Sun model: build the Dim-Time and Dim-Person dimensions from the data
# vault and store them in the Vermeulen and warehouse databases.
pd.options.mode.chained_assignment = None
sDatabaseName = 'E:/Data Science/Vermeulen.db'
conn1 = sq.connect(sDatabaseName)
sDatabaseName = 'E:/Data Science/datavault.db'
conn2 = sq.connect(sDatabaseName)
sDatabaseName = 'E:/Data Science/datawarehouse.db'
conn3 = sq.connect(sDatabaseName)
print('\n#################################')
sSQL = " SELECT DateTimeValue FROM [Hub-Time-Gunnarsson];"
DateDataRaw = pd.read_sql_query(sSQL, conn2)
DateData = DateDataRaw.head(1000)
print(DateData, 'thisis Data')
print('Time Dimension')
print('\n#################################')
t = 0
mt = DateData.shape[0]
BirthZones = ('Atlantic/Reykjavik', 'Europe/London', 'UCT')
TimeRows = []
for i in range(mt):
    for j in range(len(BirthZones)):
        t += 1
        print(t, mt * 3)
        # fix: the original referenced the undefined names DateTimeKey
        # and DateTimeValue (NameError) and never indexed the current
        # row.  Only DateTimeValue is selected above, so it feeds both
        # fields — TODO confirm whether a key column should be queried.
        BirthDateZoneStr = DateData['DateTimeValue'][i]
        BirthDateLocal = DateData['DateTimeValue'][i]
        # NOTE(review): the original overwrote the zone with the
        # constant 'UCT' on every pass, so the three-zone loop only
        # multiplies the row count; preserved — confirm intent.
        BirthZone = 'UCT'
        IDTimeNumber = str(uuid.uuid4())
        # dicts replace the removed pd.DataFrame.from_items; rows are
        # collected in a list instead of the removed per-row append.
        TimeRows.append({'TimeID': str(IDTimeNumber),
                         'UTCDate': str(BirthDateZoneStr),
                         'LocalTime': str(BirthDateLocal),
                         'TimeZone': str(BirthZone)})
TimeFrame = pd.DataFrame(TimeRows,
                         columns=['TimeID', 'UTCDate', 'LocalTime', 'TimeZone'])
DimTime = TimeFrame
DimTimeIndex = DimTime.set_index(['TimeID'], inplace=False)
sTable = 'Dim-Time'
print('\n#################################')
print('Storing :', sDatabaseName, '\n Table:', sTable)
print('\n#################################')
DimTimeIndex.to_sql(sTable, conn1, if_exists="replace")
DimTimeIndex.to_sql(sTable, conn3, if_exists="replace")
sSQL = " SELECT " + \
       " FirstName," + \
       " SecondName," + \
       " LastName," + \
       " BirthDateKey " + \
       " FROM [Hub-Person];"
PersonDataRaw = pd.read_sql_query(sSQL, conn2)
PersonData = PersonDataRaw.head(1000)
print('\n#################################')
print('Dimension Person')
print('\n#################################')
t = 0
# fix: the row count must come from PersonData; the original reused
# DateData's shape.
mt = PersonData.shape[0]
PersonRows = []
for i in range(mt):
    t += 1
    print(t, mt)
    # fix: index the current row — str(PersonData["FirstName"]) turned
    # the WHOLE column into one repr string for every record.
    FirstName = str(PersonData['FirstName'][i])
    SecondName = str(PersonData['SecondName'][i])
    # NOTE(review): this blanks every non-empty second name, which
    # looks inverted ('== 0' would be a no-op); preserved from the
    # original — confirm intent.
    if len(SecondName) > 0:
        SecondName = ""
    LastName = str(PersonData['LastName'][i])
    BirthDateKey = str(PersonData['BirthDateKey'][i])
    IDPersonNumber = str(uuid.uuid4())
    PersonRows.append({'PersonID': str(IDPersonNumber),
                       'FirstName': FirstName,
                       'SecondName': SecondName,
                       'LastName': LastName,
                       'Zone': str('UTC'),
                       'BirthDate': BirthDateKey})
PersonFrame = pd.DataFrame(PersonRows,
                           columns=['PersonID', 'FirstName', 'SecondName',
                                    'LastName', 'Zone', 'BirthDate'])
DimPerson = PersonFrame
print(DimPerson)
DimPersonIndex = DimPerson.set_index(['PersonID'], inplace=False)
sTable = 'Dim-Person'
print('\n#################################')
print('Storing :', sDatabaseName, '\n Table:', sTable)
DimPersonIndex.to_sql(sTable, conn1, if_exists="replace")
DimPersonIndex.to_sql(sTable, conn3, if_exists="replace")
- --------------------------------------------------------------------------------------
- Organize Superstep
- A) Aim: - Organizing Data
- Code: -
import sys
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Organize superstep: load the company routing data, link every pair of
# countries and attach each 'Place(Country)' node to its country, then
# draw the graph (file writes are left disabled, as in the original).
pd.options.mode.chained_assignment = None
sFileName = 'C:\\VKHCG\\01-Vermeulen\\02-Assess\\01-EDS\\02-Python\\Assess-Network-Routing-Company1.csv'
print('Loading :', sFileName)
CompanyData = pd.read_csv(sFileName, header=0, low_memory=False, encoding="latin-1")
print(CompanyData.head())
print(CompanyData.shape)

G = nx.Graph()
nRecords = CompanyData.shape[0]
countries = CompanyData['Company_Country_Name']
places = CompanyData['Company_Place_Name']
# edge between every pair of distinct country names
for a in range(nRecords):
    for b in range(nRecords):
        if countries[a] != countries[b]:
            G.add_edge(countries[a], countries[b])
# edge from each country to its 'Place(Country)' node
for a in range(nRecords):
    placeNode = places[a] + '(' + countries[a] + ')'
    if countries[a] != placeNode:
        G.add_edge(countries[a], placeNode)
print('Nodes:', G.number_of_nodes())
print('Edges:', G.number_of_edges())

sFileName = 'C:\\VKHCG\\01-Vermeulen\\02-Assess\\01-EDS\\02-Python\\Assess-Network-Routing-Company1.csv'
print('Storing :', sFileName)
#nx.write_gml(G, sFileName)
sFileName = 'C:\\VKHCG\\01-Vermeulen\\02-Assess\\01-EDS\\02-Python\\Assess-Network-Routing-Company1.csv'
print('Storing Graph Image:', sFileName)
plt.figure(figsize=(15, 15))
pos = nx.spectral_layout(G, dim=2)
nx.draw_networkx_nodes(G, pos, node_color='k', node_size=10, alpha=0.8)
nx.draw_networkx_edges(G, pos, edge_color='r', arrows=False, style='dashed')
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif', font_color='b')
plt.axis('off')
#plt.savefig(sFileName,dpi=600)
plt.show()
print('### Done!! #####################')
- ----------------------------------------------------------------------------------------
- B) Aim: - Organizing Data horizontally
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq

# Organize superstep: copy a filtered, ordered horizontal slice of
# Dim-BMI from the warehouse into the data mart, then report sizes.
sDatabaseName = 'C:/Users/exam/Downloads/datawarehouse.db'
conn1 = sq.connect(sDatabaseName)
sDatabaseName = 'C:/Users/exam/Downloads/datamart.db'
conn2 = sq.connect(sDatabaseName)

sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
PersonFrame0 = pd.read_sql_query("SELECT * FROM [Dim-BMI];", conn1)

sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
# row filter (Height/Indicator) plus ordering = the horizontal slice
sSQL = ("SELECT PersonID, Height, Weight, bmi, Indicator "
        "FROM [Dim-BMI] "
        "WHERE Height > 1.5 and Indicator = 1 "
        "ORDER BY Height, Weight;")
PersonFrame1 = pd.read_sql_query(sSQL, conn1)
DimPerson = PersonFrame1
DimPersonIndex = DimPerson.set_index(['PersonID'], inplace=False)

sTable = 'Dim-BMI-Horizontal'
print('Storing :', sDatabaseName, '\n Table:', sTable)
DimPersonIndex.to_sql(sTable, conn2, if_exists="replace")

sTable = 'Dim-BMI-Horizontal'
print('Loading :', sDatabaseName, ' Table:', sTable)
PersonFrame2 = pd.read_sql_query("SELECT * FROM [Dim-BMI-Horizontal];", conn2)

print('Full Data Set (Rows):', PersonFrame0.shape[0])
print('Full Data Set (Columns):', PersonFrame0.shape[1])
print('Horizontal Data Set (Rows):', PersonFrame2.shape[0])
print('Horizontal Data Set (Columns):', PersonFrame2.shape[1])
- ---------------------------------------------------------------------------------------------
- C) Aim: - Organizing Data Vertically
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq

# Organize superstep: copy a vertical (column) slice of Dim-BMI from
# the warehouse into the data mart, then report sizes.
sDatabaseName = 'C:/Users/exam/Downloads/datawarehouse.db'
conn1 = sq.connect(sDatabaseName)
sDatabaseName = 'C:/Users/exam/Downloads/datamart.db'
conn2 = sq.connect(sDatabaseName)
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
sSQL = "SELECT * FROM [Dim-BMI];"
PersonFrame0 = pd.read_sql_query(sSQL, conn1)
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
# vertical slice: select only the three columns of interest
sSQL = ("SELECT Height, Weight, Indicator "
        "FROM [Dim-BMI];")
PersonFrame1 = pd.read_sql_query(sSQL, conn1)
DimPerson = PersonFrame1
DimPersonIndex = DimPerson.set_index(['Indicator'], inplace=False)
sTable = 'Dim-BMI-Vertical'
print('Storing :', sDatabaseName, '\n Table:', sTable)
DimPersonIndex.to_sql(sTable, conn2, if_exists="replace")
sTable = 'Dim-BMI-Vertical'
print('Loading :', sDatabaseName, ' Table:', sTable)
sSQL = "SELECT * FROM [Dim-BMI-Vertical];"
PersonFrame2 = pd.read_sql_query(sSQL, conn2)
print('Full Data Set (Rows):', PersonFrame0.shape[0])
print('Full Data Set (Columns):', PersonFrame0.shape[1])
# fix: the original labelled these prints 'Horizontal' although this
# script builds the Vertical data set.
print('Vertical Data Set (Rows):', PersonFrame2.shape[0])
print('Vertical Data Set (Columns):', PersonFrame2.shape[1])
- ----------------------------------------------------------------------------------------------
- D) Aim: - Organize Island
- Code: -
import sys
import os
import pandas as pd
import sqlite3 as sq

# Organize superstep: extract a data "island" (the Indicator > 2
# subset of Dim-BMI) from the warehouse into the data mart.
sDatabaseName = 'C:/Users/exam/Downloads/datawarehouse.db'
conn1 = sq.connect(sDatabaseName)
################################################################
sDatabaseName = 'C:/Users/exam/Downloads/datamart(1).db'
conn2 = sq.connect(sDatabaseName)
################################################################
print('################')
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
sSQL = "SELECT * FROM [Dim-BMI];"
PersonFrame0 = pd.read_sql_query(sSQL, conn1)
################################################################
print('################')
sTable = 'Dim-BMI'
print('Loading :', sDatabaseName, ' Table:', sTable)
# the island: only rows whose Indicator exceeds 2
sSQL = ("SELECT Height, Weight, Indicator "
        "FROM [Dim-BMI] "
        "WHERE Indicator > 2 "
        "ORDER BY Height, Weight;")
PersonFrame1 = pd.read_sql_query(sSQL, conn1)
################################################################
DimPerson = PersonFrame1
DimPersonIndex = DimPerson.set_index(['Indicator'], inplace=False)
################################################################
sTable = 'Dim-BMI-Vertical'
print('\n#################################')
print('Storing :', sDatabaseName, '\n Table:', sTable)
print('\n#################################')
DimPersonIndex.to_sql(sTable, conn2, if_exists="replace")
################################################################
print('################################')
sTable = 'Dim-BMI-Vertical'
print('Loading :', sDatabaseName, ' Table:', sTable)
print('################################')
sSQL = "SELECT * FROM [Dim-BMI-Vertical];"
PersonFrame2 = pd.read_sql_query(sSQL, conn2)
################################################################
print('################################')
print('Full Data Set (Rows):', PersonFrame0.shape[0])
print('Full Data Set (Columns):', PersonFrame0.shape[1])
print('################################')
# fix: the original labelled these prints 'Horizontal' although the
# extracted subset is the island slice.
print('Island Data Set (Rows):', PersonFrame2.shape[0])
print('Island Data Set (Columns):', PersonFrame2.shape[1])
- --------------------------------------------------------------------------------------------------------
- Report Superstep
- Aim: - Generating Data
- Code: -
import sys
import os
import pandas as pd
import matplotlib as ml
from matplotlib import pyplot as plt

# Report superstep: plot warehouse profit data as pie, line, bar, barh,
# area, scatter and hexbin charts.
data = [
    ['London', 29.2, 17.4],
    ['Glasgow', 18.8, 11.3],
    ['Cape Town', 15.3, 9.0],
    ['Houston', 22.0, 7.8],
    ['Perth', 18.0, 23.7],
    ['San Francisco', 11.4, 33.3]]
os_new = pd.DataFrame(data)
# fix: removed a stray pd.Index(...) expression whose result was never
# assigned (a no-op), and whose labels did not match this data anyway.
os_new.rename(columns={0: "Warehouse Location",
                       1: "Profit 2016",
                       2: "Profit 2017"}, inplace=True)
explode = (0, 0.2, 0, 0, 0, 0.1)  # pull the 2nd and 6th slices out of the pie
labels = os_new['Warehouse Location']
colors_mine = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'lightcyan', 'lightblue']
os_new.plot(figsize=(10, 10), kind="pie", y="Profit 2017", autopct='%.2f%%',
            shadow=True, explode=explode, legend=False, colors=colors_mine,
            labels=labels, fontsize=20)
sPicNameOut1 = 'E:/Data Science/pie_explode.png'
plt.savefig(sPicNameOut1, dpi=600)
plt.show()
# the remaining charts use only the first five rows
os_new.iloc[:5].plot(figsize=(10, 10), kind='line', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'])
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='bar', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'])
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='barh', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'])
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='area', x='Warehouse Location',
                     y=['Profit 2016', 'Profit 2017'], stacked=False)
plt.show()
os_new.iloc[:5].plot(figsize=(10, 10), kind='scatter', x='Profit 2016',
                     y='Profit 2017', color='DarkBlue', marker='D')
plt.show()
os_new.iloc[:5].plot(figsize=(13, 10), kind='hexbin', x='Profit 2016',
                     y='Profit 2017', gridsize=25)
plt.show()
- -----------------------------------------------------------------------------------------------------
Add Comment
Please, Sign In to add comment