library(tidyverse)
# Load the advertising data set used by the newspaper/sales regression later
# in this script.
# NOTE(review): the path is home-relative and user-specific; adjust it before
# running on another machine.
Advertising <- read.csv(
  file = "~/Nextcloud/2022 Análisis de datos para ciencias/Advertising.csv"
)


# Baseline simulation: data drawn from the linear model y = a * x + b + e,
# with x uniform on [-2, 2] and Gaussian noise of standard deviation 1/2.
a <- 1 # true slope
b <- 2 # true intercept
n <- 100
x <- runif(n, min = -2, max = 2)
e <- rnorm(n, mean = 0, sd = 1 / 2)
y <- a * x + b + e
df <- tibble(x = x, y = y)

# Scatter plot with the least-squares fit and, in red, the true line.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = b, slope = a, color = "red")

# Ordinary least-squares fit of y on x.
regression <- lm(y ~ x, data = df)
summary(regression)

# Keep the residuals and fitted values next to the data for plotting.
df[["epshat"]] <- residuals(regression)
df[["yhat"]] <- fitted(regression)

# Distribution of the residuals.
ggplot(data = df, mapping = aes(x = epshat)) +
  geom_histogram(bins = 10)

# Distribution of the fitted values.
ggplot(data = df, mapping = aes(x = yhat)) +
  geom_histogram(bins = 10)

# Grids of candidate slopes (a) and intercepts (b) on which the
# least-squares objective S is evaluated.
rango_a <- seq(-1, 3, length.out = 100)
rango_b <- seq(0, 4, length.out = 100)

#' Residual sum of squares of the candidate line y = a * x + b.
#'
#' @param a Slope of the candidate line.
#' @param b Intercept of the candidate line.
#' @param xs Predictor values. Defaults to the global `x` looked up at call
#'   time, which keeps existing `S(a, b)` calls working unchanged.
#' @param ys Response values. Defaults to the global `y` looked up at call
#'   time.
#' @return The sum over observations of `(a * xs + b - ys)^2`.
S <- function(a, b, xs = x, ys = y) {
  # Guard against silent recycling when the two samples differ in length.
  stopifnot(length(xs) == length(ys))
  sum((a * xs + b - ys)^2)
}
# Evaluate S on the whole (rango_a x rango_b) grid: z[i, j] holds
# S(rango_a[i], rango_b[j]). The shape of z follows the grid vectors instead
# of a repeated hard-coded 100, so resizing the grids cannot leave z with the
# wrong dimensions. S takes scalar a and b, hence the Vectorize() wrapper.
z <- outer(rango_a, rango_b, Vectorize(S, vectorize.args = c("a", "b")))

# Level curves of S over the (a, b) plane.
filled.contour(
  rango_a, rango_b, z,
  frame.plot = FALSE,
  xlab = "a", ylab = "b",
  main = "Curvas de nivel de S"
)

# Sales against newspaper spend from the Advertising data, with the
# least-squares line overlaid.
ggplot(data = Advertising, mapping = aes(x = newspaper, y = sales)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Simple linear regression of sales on newspaper.
reg <- lm(sales ~ newspaper, data = Advertising)
summary(reg)

## 1) Large sigma, small n
# Same linear model as the baseline, but with only 50 points and a noise
# standard deviation of 10.
a <- 1 # true slope
b <- 2 # true intercept
n <- 50
x <- runif(n, min = -2, max = 2) # This domain matters too!
e <- rnorm(n, mean = 0, sd = 10)
y <- a * x + b + e
df <- tibble(x = x, y = y)

# Scatter plot with the fitted line and, in red, the true line.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = b, slope = a, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 2) Non-linear relationship
# The response is quadratic in x (y = a * x^2 + b + e), yet a straight line
# is fitted to it.
n <- 100
x <- runif(n, min = -2, max = 2)
a <- 1 # true coefficient of x^2
b <- 2 # true intercept
e <- rnorm(n, mean = 0, sd = 1 / 2)
y <- a * (x^2) + b + e
df <- tibble(x = x, y = y)

# Scatter plot with the (misspecified) linear fit and, in red, the true
# curve. The curve uses both a and b so that it keeps matching the
# data-generating model if either coefficient is changed.
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_function(fun = function(x) a * x^2 + b, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 3) Presence of outliers
# Baseline linear model, except that the first observation is overwritten
# with the point (100, -100), far away from the rest of the sample.
a <- 1 # true slope
b <- 2 # true intercept
n <- 100
x <- runif(n, min = -2, max = 2)
e <- rnorm(n, mean = 0, sd = 1 / 2)
y <- a * x + b + e
x[1] <- 100
y[1] <- -100
df <- tibble(x = x, y = y)

# Scatter plot with the fitted line and, in red, the true line.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = b, slope = a, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 4) Non-normal errors
# Linear model whose errors are exponential draws (rate 2) shifted by -1/2;
# an exponential with rate 2 has mean 1/2, so the shift centres them at zero.
a <- 1 # true slope
b <- 2 # true intercept
n <- 100
x <- runif(n, min = -2, max = 2)
e <- rexp(n, rate = 2) - 1 / 2
y <- a * x + b + e
df <- tibble(x = x, y = y)

# Scatter plot with the fitted line and, in red, the true line.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = b, slope = a, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)

## 5) Non-independent errors
# Linear model in which every error is the sum of one shared normal draw
# (recycled across all observations) and an observation-specific normal
# draw, so all errors have a common component.
a <- 1 # true slope
b <- 2 # true intercept
n <- 100
x <- runif(n, min = -pi / 2, max = pi / 2)
e <- rnorm(1, mean = 0, sd = 1 / 2) + rnorm(n, mean = 0, sd = 1 / 2)
y <- a * x + b + e
df <- tibble(x = x, y = y)

# Scatter plot with the fitted line and, in red, the true line.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = b, slope = a, color = "red")

regression <- lm(y ~ x, data = df)
summary(regression)