library(tidyverse)

# Advertising data set, read from a path under the user's home directory.
# NOTE(review): this path is machine-specific -- confirm it exists locally.
Advertising <- read.csv(
  "~/Nextcloud/2022 Análisis de datos para ciencias/Advertising.csv"
)

# Baseline simulation ----
# Draw n points from y = a * x + b + e, with x uniform on [-2, 2] and
# e normal with mean 0 and standard deviation 1/2.
n <- 100
x <- runif(n, -2, 2)
a <- 1
b <- 2
e <- rnorm(n, 0, 1 / 2)
y <- a * x + b + e
df <- tibble(x = x, y = y)

# Scatter plot with the fitted least-squares line (geom_smooth) and the
# true line y = a * x + b drawn in red.
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(slope = a, intercept = b, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

# Keep the residuals and fitted values next to the data and look at their
# distributions.
df$epshat <- residuals(regression)
df$yhat <- fitted(regression)

ggplot(df, aes(x = epshat)) +
  geom_histogram(bins = 10)
ggplot(df, aes(x = yhat)) +
  geom_histogram(bins = 10)

# Least-squares objective surface ----
# Grids of candidate slopes (a) and intercepts (b).
rango_a <- seq(-1, 3, length.out = 100)
rango_b <- seq(0, 4, length.out = 100)

# Sum of squared errors of the line y = a * x + b.
# `a` and `b` are the candidate slope and intercept; `x` and `y` are NOT
# arguments: they are looked up in the enclosing (global) environment at call
# time, so S reflects whatever sample is current when it is called.
S <- function(a, b) {
  sum((a * x + b - y)^2)
}

# z[i, j] = S(rango_a[i], rango_b[j]). Vectorize() makes outer() call S once
# per (a, b) pair, so the matrix dimensions follow the grids instead of being
# hard-coded.
z <- outer(rango_a, rango_b, Vectorize(S))

filled.contour(
  rango_a, rango_b, z,
  frame.plot = FALSE,
  xlab = "a",
  ylab = "b",
  main = "Curvas de nivel de S"
)

# Advertising data: sales regressed on newspaper ----
ggplot(Advertising, aes(x = newspaper, y = sales)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

reg <- lm(sales ~ newspaper, data = Advertising)
summary(reg)

## 1) Large sigma, small n
n <- 50
x <- runif(n, -2, 2) # This domain is important too!
# Scenario 1, continued: uses the `n` and `x` assigned immediately before this
# point; the error standard deviation is 10.
a <- 1
b <- 2
e <- rnorm(n, 0, 10)
y <- a * x + b + e
df <- tibble(x = x, y = y)

# Fitted line (geom_smooth) against the true line (red).
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(slope = a, intercept = b, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 2) Nonlinear relationship
# The response is quadratic in x, but a straight line is fitted.
n <- 100
x <- runif(n, -2, 2)
a <- 1
b <- 2
e <- rnorm(n, 0, 1 / 2)
y <- a * (x^2) + b + e
df <- tibble(x = x, y = y)

# The red reference curve uses the same `a` and `b` that generated the data,
# so it stays correct if either coefficient is changed.
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_function(fun = function(x) a * x^2 + b, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 3) Presence of outliers
n <- 100
x <- runif(n, -2, 2)
a <- 1
b <- 2
e <- rnorm(n, 0, 1 / 2)
y <- a * x + b + e

# Overwrite the first observation with a point far from the rest of the data.
x[1] <- 100
y[1] <- -100
df <- tibble(x = x, y = y)

ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(slope = a, intercept = b, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 4) Non-normal errors
n <- 100
x <- runif(n, -2, 2)
a <- 1
b <- 2
# Exponential errors with rate 2, shifted by -1/2 (the mean of an
# exponential with rate 2 is 1/2, so the errors are centred at zero).
e <- rexp(n, 2) - 1 / 2
y <- a * x + b + e
df <- tibble(x = x, y = y)

ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(slope = a, intercept = b, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 5) Non-independent errors
n <- 100
x <- runif(n, -pi / 2, pi / 2)
a <- 1
b <- 2
# One normal draw shared by every observation plus independent per-observation
# noise: the shared term makes the errors correlated with each other.
e <- rnorm(1, 0, 1 / 2) + rnorm(n, 0, 1 / 2)
y <- a * x + b + e
df <- tibble(x = x, y = y)

ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(slope = a, intercept = b, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)