-
Notifications
You must be signed in to change notification settings - Fork 0
/
lab2.R
143 lines (113 loc) · 4.85 KB
/
lab2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
####Stat 10 - Lab #2
## Name: Callista Wu
## SID: 904886275
## TA: Gabriel Ruiz
## Lab/Discussion Time: 1C/10AM
######################################################
#Lab 2 - Data Cleaning/Preparation and Visualization
######################################################
### Intro to Logical Statements / Relational Operators
# Relational operators work element by element and return TRUE/FALSE values.
4 > 3               # does 4 exceed 3?
c(3, 8) >= 3        # for each of 3 and 8: is it at least 3?
c(3, 8) <= 3        # for each of 3 and 8: is it at most 3?
c(1, 4, 9) == 9     # for each of 1, 4, 9: is it exactly 9?
c(1, 4, 9) != 9     # for each of 1, 4, 9: is it anything other than 9?

## Logical masks built from the NCbirths weights and genders
over_100 <- NCbirths$weight > 100
is_female <- NCbirths$gender == "Female"

# Calculations: TRUE counts as 1 and FALSE as 0, so sum() counts the matches
# and mean() gives the proportion of matches.
sum(over_100)                     # number of babies weighing over 100 ounces
mean(over_100)                    # proportion of babies over 100 ounces
mean(is_female)                   # proportion of female babies
mean(NCbirths$gender != "Male")   # proportion of babies not assigned male

# Subsets: keep the weights only for the rows flagged as female.
fem_weights <- NCbirths$weight[is_female]
######################################################
### Exercise 1
# NOTE(review): `flint` is never created in this script, so it must already
# exist in the workspace before this section runs -- confirm how it is loaded.

# a) Inspect the data
head(flint)
class(flint)

# b) Dangerous lead levels: flag every sample whose Pb value is at least 15
library(mosaic)
dangerousPb_indicator <- flint$Pb >= 15
tally(~dangerousPb_indicator, format = "proportion")
# Same proportion computed by hand, as a cross-check on tally()
sum(dangerousPb_indicator) / length(dangerousPb_indicator)

# c) Mean copper level for only the test sites in the North region
north_flint <- flint[flint$Region == "North", ]
mean(north_flint$Cu)

# d) Mean copper level for only the test sites with dangerous lead levels
# Reuses the indicator from part b) so both parts share one threshold.
dangerousPb_flint <- flint[dangerousPb_indicator, ]
mean(dangerousPb_flint$Cu)

# e) Mean lead and copper levels over all test sites
mean(flint$Pb)
mean(flint$Cu)

# f) Box plot of the lead levels
boxplot(x = flint$Pb, xlab = "Lead", ylab = "Amount of Lead Levels",
        main = "Lead Levels in Flint")

# g) Mean vs. median for lead levels in Flint
# No, the mean is not the best measure of center because the distribution of
# the data is not symmetric. Both values are printed so they can be compared.
mean(flint$Pb)
median(flint$Pb)
######################################################
### Exercise 2
# Read the countries data; the columns used below are Income and Life.
life <- read.table(
  "http://www.stat.ucla.edu/~nchristo/statistics12/countries_life.txt",
  header = TRUE
)
head(life)

# a) Scatterplot of the Life variable against the Income variable
plot(x = life$Income, y = life$Life,
     xlab = "Income", ylab = "Life Expectancy",
     main = "Life Expectancy vs. Income")

# b) Histogram and boxplot of Income on its own
# Both plots show a single variable, so the labels describe Income only.
hist(life$Income, xlab = "Income", ylab = "Frequency",
     main = "Histogram of Income")
boxplot(life$Income, ylab = "Income", main = "Boxplot of Income")

# c) Split the life data frame into two parts:
#    one with Income < 1000, the other with Income >= 1000
#    (>= keeps rows with Income exactly 1000 from being dropped)
below1k <- life[life$Income < 1000, ]
above1k <- life[life$Income >= 1000, ]

# d) For the below1k rows only: plot Life vs. Income and compute correlation
# The column is `Life`; a misspelled name after `$` silently yields NULL,
# which makes plot() draw Income against its index instead.
plot(x = below1k$Income, y = below1k$Life,
     xlab = "Income", ylab = "Life Expectancy",
     main = "Life Expectancy vs. Income")
cor(x = below1k$Income, y = below1k$Life)
######################################################
### Exercise 3
# Read the soil data; the columns used below are x, y, lead and zinc.
maas <- read.table(
  "http://www.stat.ucla.edu/~nchristo/statistics12/soil.txt",
  header = TRUE
)
head(maas)

# a) Summary statistics for lead and zinc
summary(maas$lead)
summary(maas$zinc)

# b) Histograms of lead and of log(lead)
hist(maas$lead, xlab = "Lead Levels", ylab = "Frequency",
     main = "Frequency of Lead Levels")
hist(log(maas$lead), xlab = "log(Lead) Levels", ylab = "Frequency",
     main = "Frequency vs. log(Lead)")

# c) log(lead) vs. log(zinc): log(zinc) on the x axis, log(lead) on the y axis
plot(x = log(maas$zinc), y = log(maas$lead),
     xlab = "log(zinc)", ylab = "log(lead)",
     main = "log(lead) vs. log(zinc)")

# d) Color scheme for lead concentration risk at the sampled locations
# cut() bins lead into (0,150], (150,400], (400,1000]; the bin number then
# picks the matching entry of lead_colors.
# NOTE(review): a lead value outside (0, 1000] gets an NA bin and therefore
# an NA color -- confirm the data stay within these breaks.
lead_colors <- c("blue", "purple", "yellow")
lead_levels <- cut(maas$lead, c(0, 150, 400, 1000))

# Map of the sample locations: the axes are the x/y coordinates, point size
# is lead relative to the mean lead level, and color is the risk bin.
plot(maas$x, maas$y, cex = maas$lead / mean(maas$lead),
     col = lead_colors[as.numeric(lead_levels)], pch = 19,
     xlab = "x coordinate", ylab = "y coordinate",
     main = "Lead Concentration (ppm) by Location")
######################################################
### Exercise 4
# Load the Los Angeles data from the course site.
LA <- read.table(
  file = "http://www.stat.ucla.edu/~nchristo/statistics12/la_data.txt",
  header = TRUE
)

# a) Draw the data point locations, then lay the county map on top
library(maps)
plot(
  x = LA$Longitude,
  y = LA$Latitude,
  xlim = c(-119, -118),
  ylim = c(33, 35),
  asp = 1,
  xlab = "Longitude",
  ylab = "Latitude",
  main = "Approximate Centers of Los Angeles Neighborhoods"
)
map(database = "county", regions = "california", add = TRUE)

# b) Is there a relationship between income and school performance?
#    Rows where Schools is 0 are left out of the plot and the correlation.
has_schools <- LA$Schools > 0
LA.subset <- LA[has_schools, ]
plot(
  x = LA.subset$Income,
  y = LA.subset$Schools,
  xlab = "Income",
  ylab = "School Performance",
  main = "Income vs. School Performance"
)
cor(x = LA.subset$Income, y = LA.subset$Schools)