**# put density on the y-axis: use prob = TRUE in hist()**
# Histogram of the sample means drawn on the density scale
# (probability = TRUE), so a density curve can share the same y-axis.
hist(exp_means,
     main = "Histogram of Exponential Means",
     xlab = expression(bar(X)),
     probability = TRUE,
     breaks = 20)
# Overlay the normal density with mean `mean_exp` and sd `sd_exp`,
# evaluated on a fine grid over [0, 1].
# NOTE(review): the grid is fixed to [0, 1]; widen it if the histogram
# extends beyond that range -- confirm against exp_means.
x <- seq(0, 1, by = 0.0001)
my_density <- dnorm(x, mean = mean_exp, sd = sd_exp)
lines(x, my_density, col = "purple")
**# create a table of proportions**
# Cross-tabulate HighValue by Borough, then divide every cell count by the
# number of observations and round to four decimal places.
n <- length(housing$HighValue)
cell_counts <- table(housing$HighValue, housing$Borough)
my_table <- round(cell_counts / n, digits = 4)
**# scatter plot colored by group**
# Scatter plot of logValue against logUnits, coloured by the after1950 group.
# The grouping factor is built once so that the point colours and the legend
# are derived from the same level order.
after1950_group <- factor(housing$after1950)
plot(x = housing$logUnits, y = housing$logValue,
     col = as.integer(after1950_group),
     main = "Plot of property logValue against property logUnits",
     xlab = "Log(units of property)", ylab = "Log(property value)")
# Legend with the different groups. Level k is drawn with colour k, so the
# fill must follow the level order (not the order in which the groups first
# appear in the data), otherwise labels and colours can be swapped.
legend("bottomright", legend = levels(after1950_group),
       fill = seq_along(levels(after1950_group)))
The cor() function calculates the correlation coefficient between two variables. What is the correlation between property logValue and property logUnits in (i) the whole data, (ii) just Manhattan, (iii) properties built after 1950, and (iv) properties built before 1950?
**# correlation between y and x, overall and within subsets**
# Correlation between property logValue and property logUnits,
# computed on the whole data and on several subsets of the rows.

# in the whole data
cor(housing$logValue, housing$logUnits)

# just Manhattan
in_manhattan <- housing$Borough == "Manhattan"
cor(housing$logValue[in_manhattan], housing$logUnits[in_manhattan])

# for properties built after 1950
# NOTE(review): assumes housing$after1950 is a logical vector, as its use as
# a row subscript implies -- confirm.
built_after1950 <- housing$after1950
cor(housing$logValue[built_after1950], housing$logUnits[built_after1950])

# for properties built before 1950 (the complement of the subset above)
cor(housing$logValue[!built_after1950], housing$logUnits[!built_after1950])
**# resample data**
# One resample of the row positions 1..n, drawn with replacement
# NOTE(review): n is defined elsewhere; it should be the number of rows of
# the data set that is later indexed with these positions -- confirm.
resample1 <- sample(seq_len(n), size = n, replace = TRUE)
# B resamples of the row positions, stored one per row of a B x n matrix
B <- 1000
resampled_values <- matrix(NA, nrow = B, ncol = n)
for (draw in seq_len(B)) {
  resampled_values[draw, ] <- sample(seq_len(n), size = n, replace = TRUE)
}
**# Bootstrap**
# Bootstrap resamples: refit price ~ carat on each resampled set of diamonds
# rows and keep the intercept and slope, one row of estimates per resample.
resampled_ests <- matrix(
  NA,
  nrow = B, ncol = 2,
  dimnames = list(NULL, c("Intercept_Est", "Slope_Est"))
)
for (b in seq_len(B)) {
  boot_fit <- lm(price ~ carat, data = diamonds[resampled_values[b, ], ])
  resampled_ests[b, ] <- coefficients(boot_fit)
}
# Bootstrap interval for the slope: reflect the 2.5% and 97.5% quantiles of
# the resampled slopes around the original estimate (2 * estimate - quantile),
# so the upper quantile yields the lower limit and vice versa.
# NOTE(review): lm0 is defined elsewhere; presumably the fit of price ~ carat
# on the full diamonds data -- confirm.
slope_hat <- coefficients(lm0)[2]
slope_quantiles <- quantile(resampled_ests[, "Slope_Est"],
                            probs = c(0.025, 0.975))
Cl <- 2 * slope_hat - slope_quantiles[2]
Cu <- 2 * slope_hat - slope_quantiles[1]
# Permutation test for a difference in mean Hwt between female and male cats.
# Each permutation shuffles all Hwt values, treats the first nf shuffled
# values as the "female" group and the rest as the "male" group, and records
# the difference of the two group means.
girlcats <- cats$Sex == "F"
nf <- sum(girlcats)
nm <- sum(!girlcats)
# Observed difference in means (female minus male)
Dhat <- mean(cats$Hwt[girlcats]) - mean(cats$Hwt[!girlcats])
P <- 10000
sample_diffs <- vapply(
  seq_len(P),
  function(i) {
    shuffled <- cats$Hwt[sample(1:(nf + nm))]
    mean(shuffled[1:nf]) - mean(shuffled[-(1:nf)])
  },
  numeric(1)
)
# Two-sided p-value: share of permuted differences at least as large in
# absolute value as the observed difference
pval <- mean(abs(sample_diffs) >= abs(Dhat))
**# ggplot**
# Scatter plot of Petal.Length against Sepal.Length coloured by Species,
# with one fitted lm line per species (no confidence band, full x range).
# The aesthetics are declared once and inherited by both layers.
ggplot(data = iris,
       mapping = aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)
# Several lines sharing the same x (Year), each taking y from a different
# column; the constant colour label inside aes() creates the legend entry.
ggplot(data = wtid, mapping = aes(x = Year)) +
  geom_line(aes(y = P99, color = "P99")) +
  geom_line(aes(y = P99.5, color = "P99.5")) +
  geom_line(aes(y = P99.9, color = "P99.9")) +
  labs(
    title = "Thresholds for the Richest People Over Time",
    x = "Year",
    y = "Threshold Amount",
    color = "Percentile"
  )
# Bar plot of counts by Sex, with each bar split (filled) by Survived
ggplot(titanic, aes(x = Sex, fill = factor(Survived))) +
  geom_bar() +
  labs(title = "Title", fill = "Survived")
# Bar plot of counts by Survived, drawn as separate side-by-side panels
# (one per Sex) within the same figure
ggplot(titanic, aes(x = factor(Survived), fill = factor(Survived))) +
  geom_bar() +
  facet_grid(. ~ Sex) +
  labs(title = "Title", fill = "Survived", x = "")
# Histogram of 1000 Beta(3, 1) draws on the density scale, with the
# Beta(3, 1) density curve overlaid.
x_range <- seq(0, 1.5, 0.02)
x_beta <- rbeta(1000, 3, 1)
# Build both data frames with explicitly named columns. Passing the density
# as the second argument of as.data.frame() would be taken as row.names
# rather than as a column.
hist_beta1000 <- data.frame(x_beta = x_beta)
plot_b1000 <- data.frame(x_range = x_range,
                         density = dbeta(x_range, 3, 1))
# NOTE(review): `..density..` is deprecated since ggplot2 3.4.0 in favour of
# after_stat(density); kept here so the code still runs on older versions.
ggplot(hist_beta1000) +
  geom_histogram(mapping = aes(x = x_beta, y = ..density..),
                 binwidth = 0.04) +
  geom_line(data = plot_b1000,
            mapping = aes(x = x_range, y = density),
            col = "blue")