## ----CAP394_PCDDataScience_Read01----------------------------------------
# Path of the station data file, relative to the working directory.
file <- "Data/PCDs/PCD_20060101_20061231.dat"
# First attempt: read the file using whitespace as the separator (sep = ""),
# with the first line as the header, reading only 10 rows. The string "NA"
# is treated as missing and strings are not converted to factors.
data <- read.table(file, sep = "", header = TRUE, nrows = 10,
                   na.strings = "NA", stringsAsFactors = FALSE)

## ----CAP394_PCDDataScience_Read02----------------------------------------
# Inspect the structure of what was read.
str(data)

## ----CAP394_PCDDataScience_Read03----------------------------------------
# Read only the first line of the file and split it into character tokens,
# without messages to the console.
scannedNames <- scan(text = readLines(file, n = 1), what = "", quiet = TRUE)
# Drop the first token of the header line.
# NOTE(review): presumably this token is a leading marker rather than a real
# column name -- confirm against the data file.
scannedNames <- scannedNames[-1]
scannedNames

## ----CAP394_PCDDataScience_Read04----------------------------------------
# Read the whole file, using whitespace as the separator and supplying the
# column names scanned above instead of taking them from a header line.
# The string "NA" is treated as missing; strings are not converted to factors.
data <- read.table(file, sep = "", header = FALSE, na.strings = "NA",
                   stringsAsFactors = FALSE, col.names = scannedNames)

## ----CAP394_PCDDataScience_Read05----------------------------------------
# Replace the scanned names with short English names, one per column.
colnames(data) <- c("station", "longitude", "latitude", "altitude",
                    "relhumidity", "maxtemp", "mintemp", "rainfall", "date")

## ----CAP394_PCDDataScience_Read05s---------------------------------------
str(data)

## ----CAP394_PCDDataScience_Read05s2--------------------------------------
# Show the first rows of a subset of the data frame containing only
# station, maxtemp, mintemp, rainfall, date.
head(subset(data, select = c(station, maxtemp, mintemp, rainfall, date)))

## ----CAP394_PCDDataScience_Clean01---------------------------------------
# Keep all columns except longitude, latitude and altitude.
data <- subset(data, select = -c(longitude, latitude, altitude))
str(data)

## ----CAP394_PCDDataScience_Clean02---------------------------------------
# Take the date column as character and convert it to Date using the
# format YYYYMMDD.
data$date <- as.Date(as.character(data$date), format = "%Y%m%d")

## ----CAP394_PCDDataScience_Clean03---------------------------------------
# Replace the sentinel values with NA, column by column.
# Replace all 999s in column relhumidity
data$relhumidity[data$relhumidity == 999] <- NA
# Replace all 9999.99s in column maxtemp
data$maxtemp[data$maxtemp == 9999.99] <- NA
# Replace all 9999.99s in column mintemp
data$mintemp[data$mintemp == 9999.99] <- NA
# Replace all 9999.99s in column rainfall
data$rainfall[data$rainfall == 9999.99] <- NA
str(data)

## ----CAP394_PCDDataScience_Question01------------------------------------
# Get the minimum and maximum value of the respective fields, disregarding NAs.
min(data$mintemp, na.rm = TRUE)
max(data$maxtemp, na.rm = TRUE)

## ----CAP394_PCDDataScience_Question02------------------------------------
# Get the rows where mintemp equals its minimum.
# Plain logical indexing such as data[data$mintemp == -5, ] is not suitable
# here: comparisons against NA yield NA, and NA indices produce extra all-NA
# rows in the result. subset() drops the rows whose condition is NA.
# https://stackoverflow.com/questions/1686569/filter-data-frame-rows-by-a-logical-condition
subset(data, mintemp == min(data$mintemp, na.rm = TRUE))
# Do the same for maxtemp.
subset(data, maxtemp == max(data$maxtemp, na.rm = TRUE))

## ----CAP394_PCDDataScience_Question03------------------------------------
# Create new deltatemp column by an operation on the mintemp and maxtemp.
data$deltatemp <- data$maxtemp - data$mintemp

## ----CAP394_PCDDataScience_Question04------------------------------------
# Get the rows where deltatemp equals its maximum, disregarding NAs.
subset(data, deltatemp == max(data$deltatemp, na.rm = TRUE))

## ----CAP394_PCDDataScience_Viz01,fig.width=12,fig.height=6---------------
# Select only the variables relevant to the plot: date (x axis),
# mintemp (y axis) and station (grouping).
# Note: this data frame is named `subset`, like the base function. Calls such
# as subset(data, ...) keep working because R skips non-function objects when
# looking up a function name.
subset <- subset(data, select = c(date, station, mintemp))
# ggplot2 requires the data frame to be reshaped to long form, using date and
# station as ids.
# https://stackoverflow.com/questions/13324004/plotting-multiple-time-series-in-ggplot
library(reshape2)
melted <- melt(subset, id = c("date", "station"))
# Now we can plot the multiple time series, one line per station.
library(ggplot2)
ggplot(melted, aes(x = date, y = value, colour = station, group = station)) +
  geom_line()

## ----CAP394_PCDDataScience_Viz02,fig.width=12,fig.height=6---------------
# Keep only the rows of six selected stations.
subset2 <- subset[subset$station %in%
                    c("32492", "32565", "32548", "32595", "32465", "31998"), ]
melted <- melt(subset2, id = c("date", "station"))
# Plot the time series of the selected stations; station is converted to a
# factor for the colour aesthetic.
ggplot(melted,
       aes(x = date, y = value, colour = as.factor(station), group = station)) +
  geom_line()

## ----CAP394_PCDDataScience_Viz03,fig.width=12,fig.height=6---------------
# Keep only the rows of a single station.
subset3 <- subset[subset$station %in% c("31998"), ]
melted <- melt(subset3, id = c("date", "station"))
# Plot the time series of that single station.
ggplot(melted,
       aes(x = date, y = value, colour = as.factor(station), group = station)) +
  geom_line()