This R notebook accompanies the blog post “What is the yearly risk of another Harvey-level flood in Houston?”, which contains a more high-level description of this analysis and the conclusions.

# -- Package setup -----------------------------------------------------------
# Load order matters: later packages mask earlier ones (see the masking
# messages echoed below, e.g. flexmix::boot masking boot::boot), and
# bayesplot sets its own default ggplot theme, which is reset right after.
library("rvest") # for web scraping
## Loading required package: xml2
library("stringr") # for str_extract()
library("lubridate") # for nice dates
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library("ggplot2") # for faceted plots
library("sROC") # for kernel CDF estimation
## sROC 0.1-2 loaded
library("quantreg") # for additive quantile regression
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library("coda") # for HPDinterval()
library("mgcv") # for gam()
## Loading required package: nlme
## This is mgcv 1.8-17. For overview type 'help("mgcv-package")'.
library("boot") # for time series bootstrap
library("flexmix") # for mixture modeling
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'flexmix'
## The following object is masked from 'package:boot':
## 
##     boot
library("bayesplot") # for mcmc_areas()
## This is bayesplot version 1.3.0
## Plotting theme set to bayesplot::theme_default()
# restore ggplot's standard theme after bayesplot overrode it
theme_set(theme_grey()) # brms changes default ggplot theme...

# Log-odds transform: maps a probability p in (0, 1) onto the real line.
logit <- function(p) {
  odds <- p/(1-p)
  log(odds)
}
# Inverse logit (logistic function): maps the real line back into (0, 1).
invlogit <- function(l) {
  denom <- 1 + exp(-l)
  1/denom
}

Get the data

We want the site numbers for every USGS gage in Harris County, TX. The Harris County Flood Warning System website has a list of all the USGS gages in the county, but sites have different site numbers on the Harris FWS site vs. the USGS site. The Harris site numbers are easy to grab by hand from the HTML, so I did it manually:

# Harris County FWS gage IDs, copied by hand from the FWS site's HTML.
# Each ID is the numeric suffix of a GageDetail page URL; they are scraped
# below to recover the corresponding USGS site numbers.
harris_sites <- c(100,105,110,115,120,125,130,135,140,150,160,170,180,
                  190,200,210,220,230,240,250,270,310,320,340,360,370,
                  380,400,410,420,430,435,440,445,460,465,470,475,480,
                  485,490,495,510,520,530,535,540,545,550,555,560,570,
                  575,580,585,590,595,610,620,640,710,720,750,755,760,
                  790,820,830,840,920,940,1000,1020,1040,1050,1060,1070,
                  1074,1076,1084,1086,1090,1110,1115,1120,1130,1140,1150,
                  1160,1165,1170,1180,1185,1186,1190,1195,1210,1220,1230)

With these we can scrape the USGS gage numbers from the Harris County FWS website:

# Scrape the USGS site number(s) linked from each Harris County FWS gage page.
# lapply (rather than sapply) keeps the return type a stable list even when a
# page yields zero or several site numbers; unlist() flattens it either way.
usgs <- lapply(harris_sites, function(site){
  # grab the HTML
  url <- paste0('https://www.harriscountyfws.org/GageDetail/Index/', site)
  page <- read_html(url)
  
  # extract all hrefs (pages without an href attribute yield NA, dropped here)
  a_nodes <- html_nodes(page, 'a')
  hrefs <- na.omit(sapply(html_attrs(a_nodes), function(x) x['href']))
  
  # regex-extract the site number: the text between "site_no=" and "&";
  # vapply pins the per-element result to a single character string
  site_no <- vapply(hrefs,
                    function(x) str_extract(x, '(?<=site_no=).+(?=&)'),
                    character(1))
  c(na.omit(site_no))
})
unlist(usgs)
##       href       href       href       href       href       href 
## "08076997" "08075500" "08075400" "08075000" "08074810" "08074760" 
##       href       href       href       href       href       href 
## "08074800" "08074500" "08074020" "08074540" "08074250" "08074150" 
##       href       href       href       href       href       href 
## "08072050" "08069500" "08075770" "08075730" "08069000" "08068900" 
##       href       href       href       href 
## "08068800" "08068720" "08068700" "08068780"

Construct the query URL and retrieve the data from the USGS water services API:

# Build the USGS daily-values request URL: the scraped site numbers are
# comma-separated and spliced into the query string in a single step.
query <- paste0(
  'https://waterservices.usgs.gov/nwis/dv/?sites=',
  paste(unlist(usgs), collapse=','),
  '&period=P1044W&format=rdb&indent=on&variable=00065&statCd=00003')

# retrieve the dataset and peek at the first 1000 characters
dat_html <- html_text(read_html(query))
substr(dat_html, 1, 1000)
## [1] "# ---------------------------------- WARNING ----------------------------------------\n# Some of the data that you have obtained from this U.S. Geological Survey database may not \n# have received Director's approval.  Any such data values are qualified as provisional and \n# are subject to revision.  Provisional data are released on the condition that neither the \n# USGS nor the United States Government may be held liable for any damages resulting from its use.\n#  Go to http://help.waterdata.usgs.gov/policies/provisional-data-statement for more information.\n#\n# File-format description:  http://help.waterdata.usgs.gov/faq/about-tab-delimited-output\n# Automated-retrieval info: http://help.waterdata.usgs.gov/faq/automated-retrievals\n#\n# Contact:   gs-w_support_nwisweb@usgs.gov\n# retrieved: 2017-09-25 16:21:29 -04:00\t(natwebsdas01)\n#\n# Data for the following 21 site(s) are contained in this file\n#    USGS 08068700 Cypress Ck at Sharp Rd nr Hockley, TX\n#    USGS 08068720 Cypress Ck at Katy-Ho"

Format and clean up the data

Split the data into the header text and the actual data table:

# break the raw response into individual lines
splitted <- strsplit(dat_html, split="\n", fixed=TRUE)[[1]]

# the commented header block occupies the first 47 lines (kept for later:
# it contains the human-readable site names)
header <- splitted[1:47]

# drop the first 50 lines, keep only rows beginning with "USGS", and parse
# the remaining tab-separated text into a data frame
dat <- splitted[-(1:50)]
dat <- dat[startsWith(dat, "USGS")]
dat <- read.table(text=paste(dat, collapse="\n"), stringsAsFactors=FALSE)
colnames(dat) <- c("agency", "site", "date", "height", "code")

str(dat)
## 'data.frame':    132528 obs. of  5 variables:
##  $ agency: chr  "USGS" "USGS" "USGS" "USGS" ...
##  $ site  : int  8068700 8068700 8068700 8068700 8068700 8068700 8068700 8068700 8068700 8068700 ...
##  $ date  : chr  "1999-10-01" "1999-10-02" "1999-10-03" "1999-10-04" ...
##  $ height: num  54 54 54 54 54 ...
##  $ code  : chr  "A" "A" "A" "A" ...

The data should contain only “A” (approved) and “P” (provisional) codes, but there are a small handful carrying an “:e” qualifier (“A:e” or “P:e”), which in USGS notation marks the value as estimated. Strip the qualifier, keeping just “A” or “P”.

table(dat$code)
## 
##      A    A:e      P    P:e 
## 127258      1   5262      7
# keep only the leading letter, discarding the ":e" qualifier
dat$code <- str_sub(dat$code, 1, 1)
unique(dat$code)
## [1] "A" "P"

Convert date strings to much nicer lubridate format:

# parse the ISO "YYYY-MM-DD" strings into proper Date objects
dat[["date"]] <- ymd(dat[["date"]])
str(dat)
## 'data.frame':    132528 obs. of  5 variables:
##  $ agency: chr  "USGS" "USGS" "USGS" "USGS" ...
##  $ site  : int  8068700 8068700 8068700 8068700 8068700 8068700 8068700 8068700 8068700 8068700 ...
##  $ date  : Date, format: "1999-10-01" "1999-10-02" ...
##  $ height: num  54 54 54 54 54 ...
##  $ code  : chr  "A" "A" "A" "A" ...

Grab nicer site names from the header text saved earlier:

# Pull the 21 station descriptions out of header lines 15-35; columns 20+
# hold the site name text after the "#    USGS <site_no> " prefix.
locs <- substr(header[15:35], 20, 100)
# Prefix each name with a zero-padded 01-21 index so facet panels sort in a
# stable order.
# NOTE(review): this pairs unique(dat$site) with the header rows purely by
# position -- it assumes both are listed in the same order. The printed
# output below suggests they do line up here, but confirm if re-run.
locs <- data.frame(site=unique(dat$site),
                   loc=paste0(c(paste0(0,1:9), 10:21),": ",locs))
dat <- merge(dat, locs)
unique(dat$loc)
##  [1] 01: Cypress Ck at Sharp Rd nr Hockley, TX             
##  [2] 02: Cypress Ck at Katy-Hockley Rd nr Hockley, TX      
##  [3] 03: Little Cypress Ck nr Cypress, TX                  
##  [4] 04: Cypress Ck at Grant Rd nr Cypress, TX             
##  [5] 05: Cypress Ck at Stuebner-Airline Rd nr Westfield, TX
##  [6] 06: Cypress Ck nr Westfield, TX                       
##  [7] 07: W Fk San Jacinto Rv nr Humble, TX                 
##  [8] 08: San Jacinto Rv nr Sheldon, TX                     
##  [9] 09: Whiteoak Bayou at Alabonson Rd, Houston, TX       
## [10] 10: Cole Ck at Deihl Rd, Houston, TX                  
## [11] 11: Brickhouse Gully at Costa Rica St, Houston, TX    
## [12] 12: Whiteoak Bayou at Houston, TX                     
## [13] 13: Brays Bayou at Alief, TX                          
## [14] 14: Keegans Bayou at Roark Rd nr Houston, TX          
## [15] 15: Brays Bayou at Gessner Dr, Houston, TX            
## [16] 16: Brays Bayou at Houston, TX                        
## [17] 17: Sims Bayou at Hiram Clarke St, Houston, TX        
## [18] 18: Sims Bayou at Houston, TX                         
## [19] 19: Vince Bayou at Pasadena, TX                       
## [20] 20: Hunting Bayou at IH 610, Houston, TX              
## [21] 21: Clear Ck at Mykawa St nr Pearland, TX             
## 21 Levels: 01: Cypress Ck at Sharp Rd nr Hockley, TX ...

Eyeball the marginal distribution of gage heights, looking for sketchy values:

# quick look at the marginal distribution of gage heights to spot outliers
hist(dat$height, col="gray")

Replace negative gage heights with NA:

# Count negative gage heights (physically implausible readings).
# na.rm=TRUE guards against any NAs already present in the column, which
# would otherwise make this sum NA instead of a count.
sum(dat$height < 0, na.rm=TRUE)
## [1] 218
# Replace negatives with NA. The mask excludes NAs explicitly rather than
# relying on R's silent skipping of NA logical subscripts in assignment.
dat$height[!is.na(dat$height) & dat$height < 0] <- NA
# verify nothing negative remains
sum(dat$height < 0, na.rm=TRUE)
## [1] 0

Eyeball the time series for each site, with a vertical line dividing the “A” observations from the “P” observations (the latter are always the most recent observations):

# For each site, find the date of the most recent "A" (approved) reading;
# everything after that date carries the "P" code.
bounds <- by(dat, dat$loc, function(x) max(x$date[x$code=="A"]))
# c() on the 'by' array strips the Date class, leaving the underlying
# numeric day count -- which is the form geom_vline's xintercept needs
# to line up with ggplot's internal date scale.
bounds <- data.frame(bound=c(bounds))
bounds$loc <- rownames(bounds)

# one facet per site, with a vertical line at each site's A/P boundary
ggplot(dat, aes(x=date, y=height)) + geom_line() +
  facet_wrap(~loc, ncol=1) + theme(strip.text=element_text(size=7)) +
  geom_vline(aes(xintercept = bound), data=bounds)