R Programming

My R manual (work in progress)

https://drive.google.com/file/d/1CBN0usv0-UBNexJGhqXSrnoSw9bL2F4Z/view?usp=sharing

 

Excel reading

library(readxl)

main <- read_excel("public high schools CCD.xlsx",sheet="PSM")
arc <- read_excel("public high schools CCD.xlsx",sheet="ARC")

 

String manipulation

https://www.r-bloggers.com/basic-text-string-functions-in-r/

#get a county name from column agency in main
main$pos = regexpr('COUNTY', main$agency)
main$county = substr(main$agency,1,main$pos-1)

 

Keep when a variable's value is X

GrowthMindset<-filter(ff,var_name2=="% Vali" & var_name3=="")

 

Grep (Keep rows when a variable includes a certain string)

census <- read.csv(file = 'DP02.csv')

WV<- census[grep("West Virginia", census$GEONAME), ]

 

 

Histogram

hist(treat$grandtotal,breaks=100)

 

Paste

abc="this"

def="pen"

paste(abc, "is a", def)

 

Missing values

ref_2018c %>%
mutate(value_missing=case_when(
AuditDate_68 >= -999 ~ 0,
is.na(AuditDate_68) ~ 1,
)) ->ref_2018c2

 

Reading from a CSV file and get a freq on a string variable.

setwd("C:/Users/….")

temp <- read.csv(file="filenamehere.csv",header=TRUE,sep=",")
temp2<-as.data.frame(table(temp$Exit.Reason))

 

Function

kaz_macro1<-function(var1){
var1 %>%
mutate(GrowthMindset = (q0008_0001+q0008_0002+q0008_0003+q0008_0004+q0008_0005+q0008_0006+q0008_0007+q0008_0008)/8) %>%
mutate(SelfEfficacy = (q0009_0001+q0009_0002+q0009_0003+q0009_0004+q0009_0005)/5) %>%
mutate(MSelfEfficacy = (q0010_0001+q0010_0002+q0010_0003+q0010_0004+q0010_0005+q0010_0006+q0010_0007)/7) %>%
mutate(MathAnxiety = (q0011_0001+q0011_0002+q0011_0003+q0011_0004+q0011_0005+q0011_0006)/6) %>%
mutate(TeacherUse = (q0012_0001+q0012_0002+q0012_0003+q0012_0004+q0013_0001+q0013_0002+q0013_0003+q0013_0004)/8) -> var1

subset(var1,select=c(flag,dataID,commonID,treat,GrowthMindset,SelfEfficacy,MSelfEfficacy,MathAnxiety,TeacherUse))
}
wholedata2 <- kaz_macro1(wholedata)
data_Male2 <- kaz_macro1(data_Male)

 

 

In SAS

%let var1=gender;

In R

var1<-gender

 

Function

this <-function(x){
x*24
}

this(5)

 

Works like IF statements

wholedata %>%
mutate(gender=case_when(
q0014==1 ~"Male",
q0014==2 ~"Female"
)) ->wholedata_a

 

wholedata %>%
mutate(gender=case_when(
q0014==1 ~"Male",
q0014==2 ~"Female")) %>%
mutate(race=case_when(
q0015==1 ~"White",
q0015==2 ~"non White"
)) ->wholedata_a

 

#all variable names will become lowercases
var.names<-tolower(colnames(data_name))
colnames(data_name)<-var.names

Simple recoding:

How to Recode a Column with dplyr in R?

 

 

Fixed date

https://www.stat.berkeley.edu/~s133/dates.html

wholedata$year1start <- as.Date('2012-8-1')

 

Calendar date variable in R

https://www.r-bloggers.com/date-formats-in-r/

Example.  This is for a date variable coming from an Excel file (which sets the origin date to 1899-12-30).

wholedata$ReferralDate_1001_SAS <- as.Date(wholedata$ReferralDate_1001,origin = "1899-12-30")

 

 

R instruction by matloff

https://github.com/matloff/fasteR

 

Function (Thanks, Aisaku)

mydata = cars
colnames(mydata)

proc_means = function(var){
x = list(summary= summary(var),
sd = sd(var),
N = length(var),
number_of_NA = sum(is.na(var)))
return(x)
}

proc_means(mydata$speed,)
proc_means(mydata$dist)

 

describe function (from psych package)

https://personality-project.org/r/html/describe.html

 

Getting a mean when the values include missing values

mean(temp2$mem1,na.rm=TRUE)

 

Recoding using SQLDF

temp2<-sqldf("
SELECT team_mem_1,
CASE WHEN team_mem_1='No' then 0 WHEN team_mem_1='' then NULL ELSE 1 END AS mem1,
CASE WHEN team_mem_2='No' then 0 WHEN team_mem_2='' then NULL ELSE 1 END AS mem2,
CASE WHEN team_mem_3='No' then 0 WHEN team_mem_3='' then NULL ELSE 1 END AS mem3,
CASE WHEN team_mem_4='No' then 0 WHEN team_mem_4='' then NULL ELSE 1 END AS mem4,
CASE WHEN team_mem_5='No' then 0 WHEN team_mem_5='' then NULL ELSE 1 END AS mem5,
CASE WHEN team_mem_6='No' then 0 WHEN team_mem_6='' then NULL ELSE 1 END AS mem6,
CASE WHEN team_mem_7='No' then 0 WHEN team_mem_7='' then NULL ELSE 1 END AS mem7,
CASE WHEN team_mem_8='No' then 0 WHEN team_mem_8='' then NULL ELSE 1 END AS mem8,
CASE WHEN team_mem_9='No' then 0 WHEN team_mem_9='' then NULL ELSE 1 END AS mem9,
CASE WHEN team_mem_10='No' then 0 WHEN team_mem_10='' then NULL ELSE 1 END AS mem10,
CASE WHEN team_mem_11='No' then 0 WHEN team_mem_11='' then NULL ELSE 1 END AS mem11,
FROM temp1;
")

 

SQL

sqldf("SELECT DIV_NAME, SUM(DIV_NUM) as X FROM t1
GROUP BY DIV_NAME
ORDER BY X DESC
LIMIT 3;
" )

 

Saving data

temp2<-sqldf("SELECT DIV_NAME, SUM(DIV_NUM) as X FROM t1
GROUP BY DIV_NAME
ORDER BY X DESC
LIMIT 3;
" )

 

 

 

Indexing

deck[1:10,]

 

Lists

list1<-list(100:130, "R",list(TRUE,FALSE))
list1

 

Functions

roll <-function(){
die <-1:6
dice <-sample(die,size=2,replace=TRUE)
sum(dice)
dice
}
roll()

roll <-function(die){

dice <-sample(die,size=2,replace=TRUE)
sum(dice)
dice
}

roll(die <-1:6)

 

 

 

library(ggplot2)

roll <-function(){
die <-1:6
dice <-sample(die,size=2,replace=TRUE,
prob=c(1/8, 1/8, 1/8, 1/8, 1/8, 3/8))
sum(dice)
}

rolls<-replicate(100000,roll())
qplot(rolls,binwidth=1)

 

 

#sorting data

attach(t1)
t1 <- t1[order(SECTION_COURSES_ID),]

 

If statement

if (x == 1){print("x is 1")} else if {print("x is not 1!")}

RECODING

From a text to a text

t1 <- t1 %>% mutate(course_type = case_when(
LOCAL_COURSE_TITLE == '22 Algebra I Hon, YR' ~ 'ALG 1',
LOCAL_COURSE_TITLE == '22 Math 07, YR' ~ 'Pre ALG',
))

From a text to a numeric

t1 <- t1 %>% mutate(treat = case_when(
LICENSE_NUM == 'AA1' ~ 1,
LICENSE_NUM == 'AA5' ~ 1
))

 

SQL in R

library(sqldf)

sqldf('SELECT data_from, dropout,FINALGPA, FINALGPA+2 FROM parkrose1 WHERE dropout = 1 ORDER BY FINALGPA ASC')

 

# Add new character variable agecat to abaloneMod
abaloneMod <- abaloneMod %>%
mutate(agecat = ifelse(test = age < 10.5,
yes = "< 10.5",
no = "10.5 and older"))

 

#create two samples off the data

set.seed(567)
index_random <-sample(1:nrow(coh2both),2/3*nrow(coh2both))
training_set<-coh2both[index_random,]
test_set<-coh2both[-index_random,]

 

#create a categorical variable

coh2both$gpa_cat<-rep(NA,length(coh2both$GPA_))
coh2both$gpa_cat[which(coh2both$GPA_ <=2)]<-"less than 2"
coh2both$gpa_cat[which(coh2both$GPA_ > 2)]<-"gt than 1"

 

Replace a missing value with a median

median_gpa=median(coh2both$GPA_,na.rm=TRUE)
temp1<-coh2both
temp1$GPA_[na_index]<-median_gpa
summary(temp1$GPA_)
mean(temp1$GPA_)

 

Remove a variable

x$treat <-NULL

 

Subsetting data

select_treat<-which(coh2both2$treat == 1)
comparison <- coh2both2[-select_treat, ]
treat <- coh2both2[+select_treat, ]

 

na_index<-which(is.na(coh2both2$GPA_))
result<-coh2both2[-na_index,]

 

Zscore

zscore<--scale(coh2both2$post_reach15)
mean(zscore)
var(zscore)

Chi-square test

table(coh2both2$district,coh2both2$treat)

 

#create a change score
coh2both %>%
mutate(growth=post_reach15-pre_reach15 ) ->coh2both2
plot(coh2both2$growth,coh2both2$GPA_,xlab="change",ylab="gpa")summary(coh2both2.list)
mean(coh2both2$pre_reach15 <2 )
table(coh2both2$district)

table(coh2both2$district,coh2both2$treat)

outlier detection

index_out<-which(coh2both$GPA_ > 1)
x<-coh2both[-index_out,]

Bivariate Plot
plot(coh2both$pre_reach15,coh2both$post_reach15)

Create a variable

#create a change score
coh2both %>%
mutate(growth=post_reach15-pre_reach15 ) ->coh2both2

String manipulation

library(stringr)

str_trim(" this is a test")

 

Calendar variables

# Preview students2 with str()
str(students2)

# Load the lubridate package
library(lubridate)

# Parse as date
dmy("17 Sep 2015")

# Parse as date and time (with no seconds!)
mdy_hm("July 15, 2012 12:56")

# Coerce dob to a date (with no time)
students2$dob <- ymd(students2$dob)

 

Change the variable type 

https://campus.datacamp.com/courses/cleaning-data-in-r/1828?ex=3

# Preview students with str()
str(students)

# Coerce Grades to character
students$Grades <- as.character(students$Grades)

# Coerce Medu to factor
students$Medu <- as.factor(students$Medu)

# Coerce Fedu to factor
students$Fedu <- as.factor(students$Fedu)

 

 

Dates with lubridate

https://campus.datacamp.com/courses/cleaning-data-in-r/1828?ex=1

library(lubridate)

 

The use of na.rm=TRUE

mean(x,na.rm=TRUE)

This above is an example of how telling R not to be bothered by a missing value.  Without the na.rm specificaiton, the function will not return a value.

 

Combining two variables into one.

psmdata$cohort_t_status<-paste0(psmdata$cohortid,"-",psmdata$treat)
by_cohort <- psmdata %>%
group_by(cohort_t_status) %>%
summarize(meanACT=mean(act_composite))
ggplot(by_cohort,aes(x=cohort_t_status,y=meanACT))+geom_col()

 

 

Attach() and detach()

http://www.statmethods.net/management/aggregate.html

 

How to convert SAS and other data files into R files:

http://www.ats.ucla.edu/stat/r/faq/inputdata_R.htm

 

SQL in R

sqldf

Function example:

addition = function(num1,num2){ answer = num1+num2 return(answer) } addition(10,9) addition(5,4)

 

Change working directly (Notice the slash is / not \ even on Windows)

setwd("C:/R")

You can check the current working directly by submitting:

getwd()

Read this with more details by clicking here:

How to set a working directory in R

 

Convert a CSV file into a readable dataset

temp <- read.csv(file="practice_data.csv",header=TRUE,sep=",")

type the name of the dataset to see what you just did (if big, you will see a lot of data, though not all):

temp

This would do the same thing:

temp = read.csv(file="practice_data.csv",header=TRUE,sep=",")

 

You can check what datasets you have activated by:

objects()

Print a variable off a dataset

temp2 <- temp$height

Quick look at the data

str(temp)         <This descrives the structure of data.>

head(temp)      <This prints the first 6 observations.>

Get the descriptive summary of the data or variables
summary(temp)
summary(temp$weight)

mean(BOD$demand)        <BOD is a default dataset.>

hist(BOD$demand)   <Histogram is created.>

hist(BOD$demand,breaks=4)   <The bar breask every 4 units>

boxplot(BOD$demand)

 

Create new variables

newvar=BOD$demand*2
newvar

 

How to implement an algorithm using R

Misllaneous comands

library()       You can tell which packages you downloaded

search()        This does something similar.

data()       Show available default datasets

if you type the name of the data, you can see it.  For example:

BOD

 

Reference

https://sites.google.com/site/webtextofr/  (Japanese)

Algebra and R http://atcm.mathandtech.org/EP2008/papers_full/2412008_14997.pdf