-
Notifications
You must be signed in to change notification settings - Fork 0
/
teamatad.R
37 lines (27 loc) · 1.19 KB
/
teamatad.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Import the survey responses. Text columns are read as factors because the
# rest of the script relies on factor levels (levels(), droplevels(), plot()).
kaggle_data <- read.csv(
  "kaggle_survey_2021_responses.csv",
  stringsAsFactors = TRUE
)
head(kaggle_data)
## list of column names
colnames(kaggle_data)
## Clean the data by removing the first row (presumably a non-response row
## such as the question text -- confirm against the CSV).
## Removing a row does not remove its values from the factor levels, so
## droplevels() is applied to discard levels that no remaining row uses;
## otherwise they would still be listed by levels() and counted (as zero)
## in tables and plots.
kaggle_data <- droplevels(kaggle_data[-1, ])
## All job titles reported in Q5.
head(kaggle_data[["Q5"]])
levels(kaggle_data[["Q5"]])
# Job titles this analysis is restricted to.
jobtitles <- c(
  "Business Analyst", "Data Analyst", "Data Engineer", "Data Scientist",
  "Machine Learning Engineer", "Research Scientist", "Software Engineer",
  "Statistician"
)
nrow(kaggle_data)
# Extract all the rows from kaggle_data whose Q5 answer is one of these job
# titles.
# Set membership must be tested with %in%: `==` against the 8-element vector
# would recycle it position by position (row 1 vs title 1, row 2 vs title 2,
# ...) and so would not answer "is this row's title in the list?".
index <- as.character(kaggle_data[["Q5"]]) %in% jobtitles
# Preview the logical mask that is actually used for the subset below.
head(index)
new_data <- kaggle_data[index, ]
nrow(new_data)
# Degree exploration: shorten the education labels (Q4) *before* Q4 is copied
# into focused_df, so the shortened labels are the ones used downstream.
new_data[["Q4"]] <- droplevels(new_data[["Q4"]])
degree_labels <- c(
  "Bachelor's", "Doctoral", "No Response", "Master's", "No education",
  "Professional Doctorate", "Some college/university"
)
# `levels<-` relabels purely by position and accepts a longer label vector,
# so a missing category would silently shift every label. Fail loudly instead.
# NOTE(review): the label order assumes the existing Q4 levels sort in this
# same order -- confirm with levels(new_data[["Q4"]]) before relying on it.
stopifnot(nlevels(new_data[["Q4"]]) == length(degree_labels))
levels(new_data[["Q4"]]) <- degree_labels

# Subsetting rows keeps every level of the original factor; drop the job
# titles that were filtered out so they do not appear as empty bars.
new_data[["Q5"]] <- droplevels(new_data[["Q5"]])

# Columns of interest, selected by name. A data frame keeps them as labelled
# factors; cbind() on factors would collapse them to an unnamed matrix of
# integer level codes.
focused_df <- droplevels(new_data[, c("Q5", "Q4", "Q6", "Q8")])
head(focused_df)

# Plot job title frequencies, one colour per job title actually present.
plot(focused_df[["Q5"]], col = seq_len(nlevels(focused_df[["Q5"]])))

# Cramer's V: strength of association between two categorical variables,
# from 0 (none) to 1 (perfect). Used instead of cor(), because Pearson
# correlation on factor codes depends on the arbitrary alphabetical numbering
# of the categories and is not meaningful for nominal answers.
# Returns NA when either variable has fewer than two observed categories.
cramers_v <- function(x, y) {
  counts <- table(x, y)
  # Empty rows/columns have zero expected counts and would produce NaN.
  counts <- counts[rowSums(counts) > 0, colSums(counts) > 0, drop = FALSE]
  smaller_dim <- min(dim(counts)) - 1
  if (smaller_dim < 1) {
    return(NA_real_)
  }
  # Only the test statistic is needed; the small-expected-count warning
  # concerns the p-value approximation, which is not used here.
  chi_sq <- suppressWarnings(
    stats::chisq.test(counts, correct = FALSE)$statistic
  )
  sqrt(unname(chi_sq) / (sum(counts) * smaller_dim))
}

# Pairwise association matrix between the selected questions.
association <- vapply(
  focused_df,
  function(col_a) {
    vapply(focused_df, function(col_b) cramers_v(col_a, col_b), numeric(1))
  },
  numeric(length(focused_df))
)
association
# NOTE(review): the earlier remark "moderate correlation between Q5 and Q6"
# came from cor() on factor codes; re-check it against the Cramer's V table.