# create local SQLite database using airline delays # Nicholas Horton, nhorton@amherst.edu 4/11/2015 # this script assumes that the following files have been downloaded # from http://www.amherst.edu/~nhorton/precursors/files: # airplanes.csv, airlines.csv, airports.csv, 2014.csv.bz2, test-sqlite.Rmd # for more information, see http://www.amherst.edu/~nhorton/precursors require(mosaic); require(dplyr); require(readr); require(RSQLite) dbname = "ontime.sqlite3" createIndex <- TRUE start <- Sys.time(); start # first read the small tables tables = c("airplanes", "airlines", "airports") con <- dbConnect(SQLite(), dbname) for (table in tables) { df <- read_csv(paste(table,".csv", sep="")) if (!is.data.frame(df)) next message("Creating table: ", table) dbWriteTable(con, table, as.data.frame(df), overwrite=TRUE) } # now read the larger table (flights) years = as.character(1987:2014) # need to download all of the other years for (whichyear in years) { cat("processing", whichyear) flights <- read_csv(paste(whichyear,".csv.bz2", sep="")) flights <- rename(flights, DayOfMonth = DayofMonth) cat(":loading") dbWriteTable(con, "flights", as.data.frame(flights), append=TRUE) cat("\n") firstFlight = FALSE } if (createIndex==TRUE) { cat("Creating indices.\n") dbGetQuery(con, "CREATE INDEX Year on flights(Year);") dbGetQuery(con, "CREATE INDEX YearMon on flights(Year, Month);") dbGetQuery(con, "CREATE INDEX Date on flights(Year, Month, DayOfMonth);") dbGetQuery(con, "CREATE INDEX YearMonDest on flights(Year, Month, Dest);") dbGetQuery(con, "CREATE INDEX YearDayDest on flights(Year, Month, DayOfMonth, Dest);") dbGetQuery(con, "CREATE INDEX YearDayCarr on flights(Year, Month, DayOfMonth, UniqueCarrier);") dbGetQuery(con, "CREATE INDEX Origin on flights(Origin);") dbGetQuery(con, "CREATE INDEX Dest on flights(Dest);") dbGetQuery(con, "CREATE INDEX TailNum on flights(TailNum);") } end <- Sys.time(); end # let's see what has been created dbListTables(con) dbListFields(con, "airports") # and now we can access these within dplyr my_db <- src_sqlite(path=dbname) airports <- tbl(my_db, "airports") airplanes <- tbl(my_db, "airplanes") airlines <- tbl(my_db, "airlines") flights <- tbl(my_db, "flights") filter(airports, IATA %in% c("ALB", "BTV", "BDL")) # see test-sqlite.Rmd for examples of analyses using these data sources