# Untitled

#import the relevant libraries
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Load the startup dataset and show it exactly as read from disk.
df = pd.read_csv("Team1_data_Startups.csv")
print(df)

# Drop all rows that have any NaN values.
# DataFrame.dropna() returns a new frame and leaves the original untouched, so
# the result has to be assigned back; calling it bare (as before) removed nothing.
# NOTE(review): with the rows actually removed, every model below is fitted on
# the same complete-case sample -- confirm this is the intended sample.
df = df.dropna()
print(df)

# NOTE(review): this option is set after both print(df) calls above, so it does
# not influence them -- confirm whether it was meant to come first.
pd.set_option('display.max_columns', 15)

# Set the categorical variables to the 'category' dtype so the formula
# interface treats them as categorical rather than numeric/text columns.
df['number_of_employees'] = df['number_of_employees'].astype('category')
df['State'] = df['State'].astype('category')
df['AI_Type'] = df['AI_Type'].astype('category')
df['funding_status_adjusted'] = df['funding_status_adjusted'].astype('category')

# Create the linear models.
#
# Every model follows the same recipe: fit an OLS regression on df, print a
# human-readable label, print the statsmodels summary, then print a spacer.


def _fit_and_report(formula, label):
    """Fit an OLS model on ``df``, print its label and summary, and return it.

    Args:
        formula: Formula string handed to ``smf.ols``.
        label: Heading printed above the summary table.

    Returns:
        The fitted results object returned by ``.fit()``.
    """
    # Fit first so that nothing is printed if the model cannot be estimated.
    fitted = smf.ols(formula=formula, data=df).fit()
    print(label, "\n")
    print(fitted.summary())
    print("\n\n\n")
    return fitted


# --- Response: the actual funding numbers ---------------------------------
linearmodel1 = _fit_and_report(
    formula='Funding_amount ~ AI_Type',
    label='Funding_amount ~ AI type',
)
linearmodel2 = _fit_and_report(
    formula='Funding_amount ~ Patents',
    label='Funding_amount ~ Patents',
)
linearmodel3 = _fit_and_report(
    formula='Funding_amount ~ Patents+AI_Type',
    label='Funding_amount ~ Patents + AI_Type',
)
linearmodel4 = _fit_and_report(
    formula='Funding_amount ~ Patents+Age',
    label='Funding_amount ~ Patents + Age',
)
linearmodel5 = _fit_and_report(
    formula='Funding_amount ~ Patents+AI_Type+Age',
    label='Funding_amount ~ Patents + AI_Type + Age',
)
linearmodel6 = _fit_and_report(
    formula='Funding_amount~funding_status_adjusted+Patents+AI_Type+Age',
    label='Funding_amount ~ funding_status + Patents + AI_Type + Age',
)
linearmodel7 = _fit_and_report(
    formula='Funding_amount~State+funding_status_adjusted+number_of_employees+Number_of_funding_rounds+Patents+AI_Type+Age',
    label='Funding_amount ~ State + funding_status + number_of_employees + Number_of_funding_rounds + Patents + AI_Type + Age',
)

# --- Response: Funding_amount_micron --------------------------------------
# (per the original author's note: the funding amount / 1,000,000)
linearmodel8 = _fit_and_report(
    formula='Funding_amount_micron ~ AI_Type',
    label='Funding_amount_micron ~ AI_Type',
)
linearmodel9 = _fit_and_report(
    formula='Funding_amount_micron ~ Patents',
    label='Funding_amount_micron ~ Patents',
)
linearmodel10 = _fit_and_report(
    formula='Funding_amount_micron ~ Patents+AI_Type',
    label='Funding_amount_micron ~ Patents + AI_Type',
)
linearmodel11 = _fit_and_report(
    formula='Funding_amount_micron ~ Patents+Age',
    label='Funding_amount_micron ~ Patents + Age',
)
linearmodel12 = _fit_and_report(
    formula='Funding_amount_micron ~ Patents+AI_Type+Age',
    label='Funding_amount_micron ~ Patents + AI_Type + Age',
)
linearmodel13 = _fit_and_report(
    formula='Funding_amount_micron~funding_status_adjusted+Patents+AI_Type+Age',
    label='Funding_amount_micron ~ funding_status + Patents + AI_Type + Age',
)
linearmodel14 = _fit_and_report(
    formula='Funding_amount_micron~State+funding_status_adjusted+number_of_employees+Number_of_funding_rounds+Patents+AI_Type+Age',
    label='Funding_amount_micron ~ State + funding_status + number_of_employees + Number_of_funding_rounds + Patents + AI_Type + Age',
)

# --- Response: Funding_amount_log -----------------------------------------
# (per the original author's note: the log of the funding amount)
linearmodel15 = _fit_and_report(
    formula='Funding_amount_log ~ Patents',
    label='Funding_amount_log ~ Patents',
)
linearmodel16 = _fit_and_report(
    formula='Funding_amount_log ~ AI_Type',
    label='Funding_amount_log ~ AI_Type',
)
linearmodel17 = _fit_and_report(
    formula='Funding_amount_log ~ Patents+AI_Type',
    label='Funding_amount_log ~ Patents + AI_Type',
)
linearmodel18 = _fit_and_report(
    formula='Funding_amount_log ~ Patents+Age',
    label='Funding_amount_log ~ Patents + Age',
)
linearmodel19 = _fit_and_report(
    formula='Funding_amount_log ~ Patents+AI_Type+Age',
    label='Funding_amount_log ~ Patents + AI_Type + Age',
)
linearmodel20 = _fit_and_report(
    formula='Funding_amount_log~State+funding_status_adjusted+number_of_employees+Number_of_funding_rounds+Patents+AI_Type+Age',
    label='Funding_amount_log ~ State + funding_status + number_of_employees + Number_of_funding_rounds + Patents + AI_Type + Age',
)

# Same specification as linearmodel20 but without the State term.
linearmodelfinal = _fit_and_report(
    formula='Funding_amount_log~funding_status_adjusted+number_of_employees+Number_of_funding_rounds+Patents+AI_Type+Age',
    label='Funding_amount_log ~ funding_status + number_of_employees + Number_of_funding_rounds + Patents + AI_Type + Age',
)

# Create the plots.

# Scatter plots, one figure each, described as (x column, y column, title):
#   1. number of patents vs. log funding amount
#   2. company age vs. number of patents
#   3. company age vs. log funding amount
scatter_specs = [
    ('Patents', 'Funding_amount_log',
     'Graph showing the number of patents and the funding amount'),
    ('Age', 'Patents',
     'Graph showing the age of the companies and the number of patents'),
    ('Age', 'Funding_amount_log',
     'Graph showing the funding amount and the age of the companies'),
]
for x_col, y_col, plot_title in scatter_specs:
    df.plot.scatter(x=x_col, y=y_col, title=plot_title)
    plt.show()

# Bar graph of the mean Funding_amount_micron for each AI_Type.
mean_funding_by_type = df.groupby('AI_Type')['Funding_amount_micron'].mean()
bar_axes = mean_funding_by_type.plot.bar()
bar_axes.set_xlabel('AI_Type')
bar_axes.set_ylabel('Mean of Funding amount micron')
bar_axes.set_title('AI_Type Vs Mean of Funding amount micron')
plt.show()