# Untitled

Jul 14th, 2023
326
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
library(corrplot)
library(ggplot2)
library(reshape2)

# Simulate data ----

# Set a seed for reproducibility of results.
set.seed(123)

# Simulate the sizes of 100 houses in square feet, drawn from a normal
# distribution with mean 1500 and standard deviation 500.
size <- rnorm(100, mean = 1500, sd = 500)

# Simulate the house prices. The `50000 + 100 * size` part encodes a linear
# relationship between size and price (intercept 50000, slope 100 per square
# foot); the `rnorm()` part adds random noise (mean 0, sd 50000).
price <- 50000 + 100 * size + rnorm(100, mean = 0, sd = 50000)

# Create a data frame combining the sizes and corresponding prices.
df <- data.frame(size, price)

# Correlation analysis ----

# Calculate and print the correlation between size and price.
correlation <- cor(df$size, df$price)
cat("Correlation between size and price: ", correlation, "\n")

# Generate a scatter plot of the raw data (solid points, pch = 19).
plot(
  df$size, df$price,
  main = "Scatter plot of size vs price",
  xlab = "Size",
  ylab = "Price",
  pch = 19
)

# Generate the correlation matrix over all columns of `df` and print it.
corr_matrix <- cor(df)
print(corr_matrix)

# Generate a heatmap of the correlations: colored cells, upper triangle only,
# variables ordered by hierarchical clustering.
corrplot(
  corr_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.col = "black", # Text label color
  tl.srt = 45 # Text label rotation (degrees)
)

# Linear regression ----

# We use a linear regression model to analyze the relationship between house
# size (independent variable) and its price (dependent variable).
# Null Hypothesis (H0): There is no linear relationship between house size and
#   its price (the slope is zero).
# Alternative Hypothesis (H1): There is a linear relationship between house
#   size and its price (the slope is not zero).

# Fit the model: `price` is the dependent variable, `size` the independent one.
model <- lm(price ~ size, data = df)

# Generate a scatter plot and overlay the fitted regression line in red.
plot(
  df$size, df$price,
  main = "Scatter plot of size vs price",
  xlab = "Size",
  ylab = "Price",
  pch = 19
)
abline(model, col = "red")

# Print a summary of the regression model, which includes the coefficients,
# the R-squared value, and the p-value. The explicit print() makes the summary
# appear even when this script is run via source(), where top-level values are
# not auto-printed by default.
print(summary(model))

# Interpretation hints:
# 1. Look at the 'Estimate' for 'size' in the 'Coefficients' table. This is the
#    slope of the regression line, and it indicates the change in house price
#    for each one-unit increase in house size. If the p-value associated with
#    this estimate is less than 0.05, then the relationship is significant,
#    and we can reject the null hypothesis.
# 2. The '(Intercept)' in the 'Coefficients' table is the y-intercept of the
#    regression line, which is the predicted price when the size is zero.
# 3. The 'Residuals' section gives you information about the distribution of
#    the residuals, which should ideally be normally distributed. Look for any
#    large deviations in these values.
# 4. The 'R-squared' value indicates the proportion of variance in the
#    dependent variable that can be explained by the independent variable(s).
#    The closer this value is to 1, the better the fit of the model.

# Predictions ----

# Predict prices from the fitted model for the same houses it was fitted on.
# Computed once and reused for both the printout and the plot below.
predicted_prices <- predict(model, newdata = df)
print("Predicted prices based on the model:")
print(predicted_prices)

# Create a data frame pairing each actual price with its predicted price.
plot_df <- data.frame(Actual = df$price, Predicted = predicted_prices)

# Generate the plot of actual price vs predicted price. The dashed red line is
# y = x: points on it are predicted exactly.
actual_vs_predicted <- ggplot(data = plot_df, aes(x = Actual, y = Predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Actual vs Predicted Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal()

# Print explicitly so the plot is drawn even when the script is run via
# source(), where top-level ggplot objects are not auto-printed.
print(actual_vs_predicted)

# Residual diagnostics ----

# Extract and show the residuals of the model.
# NOTE(review): the variable name `residuals` is the same as the stats
# function residuals(); it is kept here in case later code reads it.
residuals <- resid(model)
print("Residuals of the model:")
print(residuals)

# Plot a histogram of the residuals to check if they're normally distributed.
hist(
  residuals,
  main = "Histogram of Residuals",
  xlab = "Residuals",
  col = "lightblue"
)