# Untitled

# Assignment 1
# Section 2

# Clear all objects currently in memory so that leftovers from an earlier
# session cannot silently feed into the results below
rm(list = ls())

# -----------------------------------------------------------------------------------
# Q2.1

# Set the working directory (machine-specific path: adjust it when running the
# script on another computer)
setwd("C:/Users/Vivien/Desktop/Third Year/Semester 1/Econometrics 2/Assignments/Assignment 1")

# Import the dataset from the working directory (read.csv only loads the data;
# nothing is printed at this point)
studies_df <- read.csv("A1_Data.csv")

# Short aliases for the raw assignment and exam marks, reused further down
assg <- studies_df$assg
exam <- studies_df$exam

# Convert assg(i) and exam(i) into percent and store them as new columns.
# The divisors 30 and 70 are presumably the maximum attainable marks -- confirm
# against the assignment brief.
studies_df$AssgPercent <- (assg / 30) * 100
studies_df$ExamPercent <- (exam / 70) * 100

# ------------------------------------------------------------------------------------
# Obtain descriptive statistics for assg(i) and exam(i); non-percentage form

# Table of summary statistics for assg(i): one statistic per row, single column
statstable1 <- rbind(
  mean(assg),    # Mean
  median(assg),  # Median
  sd(assg),      # Standard deviation
  min(assg),     # Minimum
  max(assg)      # Maximum
)

# Label the rows (same order as the rbind() call above) and the single column
rownames(statstable1) <- c("Mean", "Median", "SD", "Min", "Max")
colnames(statstable1) <- "Assignment Mark"

# Print table of summary statistics to 4 d.p.
print(round(statstable1, 4))

# Table of summary statistics for exam(i): one statistic per row, single column
statstable2 <- rbind(
  mean(exam),    # Mean
  median(exam),  # Median
  sd(exam),      # Standard deviation
  min(exam),     # Minimum
  max(exam)      # Maximum
)

# Label the rows (same order as the rbind() call above) and the single column
rownames(statstable2) <- c("Mean", "Median", "SD", "Min", "Max")
colnames(statstable2) <- "Exam Mark"

# Print table of summary statistics to 4 d.p.
print(round(statstable2, 4))

# Default histogram of the raw assignment marks (title and axis label are
# generated by hist() from the argument expression)
hist(studies_df$assg)

# Same data again with explicit labels, roughly 20 bins and yellow bars
hist(
  studies_df$assg,
  col    = "yellow",
  breaks = 20,
  xlab   = "Total Mark Across Four Assignments",
  main   = "Assignment Mark"
)

# Default histogram of the raw exam marks
hist(studies_df$exam)

# Same data again with explicit labels, roughly 20 bins and pink bars
hist(
  studies_df$exam,
  col    = "pink",
  breaks = 20,
  xlab   = "Final Exam Mark",
  main   = "Exam Mark"
)

# -----------------------------------------------------------------------------------
# Obtain descriptive statistics for assg(i) and exam(i); percentage form

# Short aliases for the percentage columns of studies_df
assg2 <- studies_df$AssgPercent
exam2 <- studies_df$ExamPercent

# Table of summary statistics for assg(i) in percent: one statistic per row
statstable3 <- rbind(
  mean(assg2),    # Mean
  median(assg2),  # Median
  sd(assg2),      # Standard deviation
  min(assg2),     # Minimum
  max(assg2)      # Maximum
)

# Label the rows (same order as the rbind() call above) and the single column
rownames(statstable3) <- c("Mean", "Median", "SD", "Min", "Max")
colnames(statstable3) <- "Assignment Mark (%)"

# Print table of summary statistics to 4 d.p. (same precision as the
# non-percentage tables)
print(round(statstable3, 4))

# Table of summary statistics for exam(i) in percent: one statistic per row
statstable4 <- rbind(
  mean(exam2),    # Mean
  median(exam2),  # Median
  sd(exam2),      # Standard deviation
  min(exam2),     # Minimum
  max(exam2)      # Maximum
)

# Label the rows (same order as the rbind() call above) and the single column
rownames(statstable4) <- c("Mean", "Median", "SD", "Min", "Max")
colnames(statstable4) <- "Exam Mark (%)"

# Print table of summary statistics to 4 d.p. (same precision as the
# non-percentage tables)
print(round(statstable4, 4))

# Default histogram of the assignment marks in percent (title and axis label
# are generated by hist() from the argument expression)
hist(studies_df$AssgPercent)

# Same data again with explicit labels and yellow bars; the number of bins is
# left to hist()'s default here
hist(
  studies_df$AssgPercent,
  col  = "yellow",
  xlab = "Total Percentage Mark Across Four Assignments",
  main = "Assignment Mark"
)

# Default histogram of the exam marks in percent
hist(studies_df$ExamPercent)

# Same data again with explicit labels, roughly 20 bins and pink bars
hist(
  studies_df$ExamPercent,
  col    = "pink",
  breaks = 20,
  xlab   = "Final Percentage Exam Mark",
  main   = "Exam Mark"
)

# The histogram for 'Assignment Mark' has a long left tail and is hence negatively skewed. This makes sense since the Mean < Median.

# The histogram for 'Exam Mark' is more symmetric and is possibly close to a normal distribution (ignoring the outlier on the left). This makes sense since the Mean is approximately equal to the Median.

# ------------------------------------------------------------------------------------
# Q2.2

# Run an OLS regression of the exam percentage on the assignment percentage,
# using every row of studies_df
eqn1 <- lm(ExamPercent ~ AssgPercent, data = studies_df)
print(summary(eqn1))

# Statistical interpretation
# Intercept
# The statistical interpretation of B0 is that, for a student who obtains an overall mark of 0 for the assignments, the student's overall exam mark will on average be 26.3553 (out of 70).

# Coefficient of assg
# The statistical interpretation of B1 is a 1 mark increase in the assignment mark will on average result in a 0.7258 increase in the overall exam mark.

# Causal interpretation
# Intercept
# The causal interpretation of B0 is for a student who obtains an overall mark of 0 for the assignments, it will cause the student to have an overall exam mark of 26.3553 (out of 70).

# Coefficient of assg
# The causal interpretation of B1 is a 1 mark increase in the assignment mark will cause a 0.7258 increase in the overall exam mark.

# ------------------------------------------------------------------------------------
# Q2.3

# Removing all cases where flag.0(i)=1 (removing cases where a student obtained a 0 for an assessment):
# the regression is re-estimated on the rows of studies_df with flag.0 equal to 0
eqn2 <- lm(ExamPercent ~ AssgPercent, data = subset(studies_df, flag.0 == 0))
print(summary(eqn2))

# The coefficient of assg(i) has increased by 0.2121 from 0.7258 to 0.9379.
# This indicates that for students who completed all assessments/obtained a mark greater than zero
# their marginal benefit of an extra mark on their assignments on the overall exam mark is 0.2121
# more than a student who had failed/not completed an assessment.

# I believe that this is a sensible/innocuous decision because it will allow us to obtain a more accurate
# causal interpretation of obtaining an extra assignment mark on the overall exam mark
# Ultimately, we are interested in seeing the correlation/relationship between assignment marks and the overall exam mark.
# Including students who had failed or did not complete an assessment would serve
# as an outlier in our regression analysis and would impact the accuracy of our analysis.

# ------------------------------------------------------------------------------------
# Q2.4

# No, I do not believe that the regression model in Q2.3 suffers from simultaneous causality.
# Though it is plausible to say that a change in the assignment mark will cause a change in the overall exam mark
# (e.g. those who work harder do better on the exam/overall)
# it is not plausible to say a change in the exam mark will affect the assignment mark.
# This is because at the point of doing the exam, all students would have completed their assignments
# and would have already received finalised assignment marks (cannot be changed due to a change in exam mark).

# ------------------------------------------------------------------------------------
# Q2.5

# See notes

# ------------------------------------------------------------------------------------
# Q2.6

# If we did not assume assignments are undertaken individually, then it would be
# difficult to say that there is a relationship between views(i) and assg(i).
# The level of participation in Ed will not necessarily impact the assg mark when done in a group.
# Not strictly controlling for individual ability.

# ------------------------------------------------------------------------------------
# Q2.7

# Adding views(i) to the regression, estimated on the same subsample as Q2.3
# (rows of studies_df with flag.0 equal to 0)
eqn3 <- lm(ExamPercent ~ AssgPercent + views, data = subset(studies_df, flag.0 == 0))
print(summary(eqn3))
# Intercept
# A student who achieves an overall assignment mark of 0 and has viewed 0 threads on Ed
# will on average obtain an overall exam mark of 24.0259 (out of 70).
# Not a valid interpretation, as we have excluded students who obtained a 0 on any assessment.
# An overall assignment mark of 0 implies that the student obtained a 0 for all assignments.
# The average mark has increased by 2.5562 (gone from 21.4697 to 24.0259).

# Coefficient of assg
# Controlling for the number of threads the student has viewed on Ed,
# obtaining 1 extra mark on the assignment will on average result in a
# 0.7173 increase in the overall exam mark (marginal benefit of an extra mark on their
# assignments on the overall exam mark)

# Coefficient of views
# Controlling for the total mark over all four assignments, viewing 1 extra thread on Ed
# will on average result in a 0.0093 increase in the overall exam mark.

# Because there is a difference between Delta1 and Beta1, it is clear that an omitted variable bias (OVB) problem exists.
# A regression of exam(i) on assg(i) will not have a causal interpretation.
# If we require a causal estimate, then we need to develop a strategy to obtain one.

# ------------------------------------------------------------------------------------
  # Q2.8