Commit 5d409245 authored by Mike Bedington's avatar Mike Bedington

Initial commit

import netCDF4 as nc
import numpy as np
river_nc_filestr = ''
sediment_types = ['mud_1', 'mud_2', 'sand_1', 'sand_2']
def sediment_function(Q, cnst):
    """Rating-curve estimate of suspended sediment (SPM) from river flux.

    Q    : river flux; numpy array or (now also) a scalar.
    cnst : calibration constant of the rating curve.

    Returns ``cnst * (Q/10)**1.2`` with a constant floor of 5 wherever
    Q < 10 (low-flow background concentration).
    """
    Q = np.asarray(Q, dtype=float)
    # np.where handles 0-d (scalar) input too; the original in-place mask
    # assignment (spm[Q < 10] = 5) required an array argument.
    spm = np.where(Q < 10, 5.0, cnst * ((Q / 10) ** 1.2))
    return spm
# Open the rivers forcing file for in-place update.
river_nc = nc.Dataset(river_nc_filestr, 'r+')
time_dim = river_nc.dimensions['time'].size
river_flux = river_nc.variables['river_flux'][:]
river_name_list_raw = river_nc.variables['name_list_real'][:]

# Decode the byte-array river names into python strings.
river_name_list = []
for this_row in river_name_list_raw:
    river_name_list.append(str(b''.join(this_row), 'utf-8'))

# Rating-curve constant per river: the Tamar gets its own calibration,
# everything else shares a default.
# NOTE(review): the if/else here was collapsed in the captured diff; the
# else branch is reconstructed -- confirm against the full source.
spm_raw = []
for this_river, this_q in zip(river_name_list, river_flux.T):
    if 'Tamar' in this_river:
        cnst = 4.6
    else:
        cnst = 3.2
    spm_raw.append(sediment_function(this_q, cnst))
# /1000: convert mg/l to kg m^-3 -- TODO confirm units against file metadata
spm_val = np.vstack(spm_raw).T / 1000

for this_sediment in sediment_types:
    #this_sed_var = river_nc.createVariable(this_sediment, 'f4', ('time','rivers'))
    #this_sed_var.long_name = 'Mud mud glorious mud: ' + this_sediment
    #this_sed_var.units = 'kgm^-3'
    # All the sediment load is assigned to the first mud class; the other
    # classes are zero-filled.
    # NOTE(review): else branch reconstructed from the collapsed diff.
    if this_sediment == 'mud_1':
        river_nc.variables[this_sediment][:] = spm_val
    else:
        river_nc.variables[this_sediment][:] = np.zeros(np.shape(river_flux))

# Flush the edits to disk.
river_nc.close()
from fvcom_river.river import River, RiverMulti
from fvcom_river.ltls_functions import ltls_data_store, ltls_comp_river, nemoRiver, nemoMultiRiver
from fvcom_river.river_functions import *
# Script to batch download the CEH river flow data using the CEH river IDs
# (see the CEH/NRFA station listing; the original link was lost in this capture).
# The search has to be done in two goes (once for stations east of the
# Greenwich meridian, once for stations west of it). The results from those two
# searches are in ceh_rivers_info.csv.
# rivers_IRS_crossref.txt is a file from Karen Amoudry (originally from Laurent
# Amoudry) which gives the POLCOMS river data in the following format:
# POLCOMS ID | x | y | CEH ID | River flow weighting factor | River name
# Pierre Cazenave (Plymouth Marine Laboratory)
# Truncate the running positions list before the loop appends to it.
echo -n "" > positions.txt
# Main logic.
# Won't overwrite existing files but will extract their positions into the
# positions.txt file.
# NOTE(review): recovered from a collapsed diff -- $out, $loc, $res, $cl and
# $info are referenced but never set in what survives, several if/else/fi
# lines are missing, and the get_data function header is gone. Do not run
# as-is; restore from the full source.
while read line; do
name=$(echo $line | cut -f2 -d,)
id=$(echo $line | cut -f1 -d,)
echo "Getting river $name ($id)"
# NOTE(review): $out is presumably derived from $id just above here (line lost).
if [ -f "$out" ]; then
# Existing download: just pull its grid reference into positions.txt.
echo ${id},${name}_${loc},$(\grep Reference $out | cut -f3 -d,) >> positions.txt
echo "skipping."
# NOTE(review): an 'else' presumably preceded the download branch below.
echo -n "saving to ${out}... "
# Start with a content length of 30 and increment by one until it
# works (set the upper content length limit at 40).
until [ $res -eq 0 -o $cl -eq 40 ]; do
get_data $id "$out" $cl
# NOTE(review): this inner 'if' has no surviving 'fi'; the success/retry
# branching was lost in the capture.
if [ -f "$out" ]; then
cl=$(($cl + 1))
echo "done."
# Extract the grid reference for this station
echo ${id},$(\grep Reference "$out" | cut -f3 -d,) | \
sed 's/,$//g' >> ${id}_loc.txt
# Strip off all the header info so that there is just a csv of data, date
sed '/^[0-9].*/!d' ${id}_raw.csv >> ${id}_data.csv
# Wait a realistic amount of time before trying the next one.
sleep $(echo "scale=0; ($RANDOM % 10) + 10" | bc -l)
done < $info
# Wrapper function around CURL to get the data. Takes three arguments:
# Output file name
# Content length (defaults to 31)
# A timeout of 10 seconds is set for the command.
# NOTE(review): the function definition line (e.g. 'get_data() {') and the
# positional-argument unpacking were lost in the capture.
if [ -z $contentlength ]; then
curl \
--silent \
--header 'Host:' \
--header 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0' \
--header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' \
--header 'Accept-Language: en-GB,en;q=0.5' \
--header 'DNT: 1' \
--header "Referer:${id}&dt=gdf" \
--header 'Connection: keep-alive' \
--header 'Content-Type: application/x-www-form-urlencoded' \
--header "Content-Length: $contentlength" \
--max-time 10 \
--data-binary "db=nrfa_public&stn=${id}&dt=gdf" \
"${id}.csv" \
-o "$out" -L
# Earlier variant of the same request, kept commented out for reference:
#curl \
# --silent \
# --header 'Host:' \
# --header 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0' \
# --header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' \
# --header 'Accept-Language: en-gb,en;q=0.5' \
# --header 'DNT: 1' \
# --header "Referer:$id&dt=gdf" \
# --header 'Connection: keep-alive' \
# --header 'Content-Type: application/x-www-form-urlencoded' \
# --header "Content-Length: $contentlength" \
# --max-time 10 \
# -X POST \
# --data-binary "db=nrfa_public&stn=$id&dt=gdf" \
# "$id/data.csv" \
# -o "$out" -L
# Launch the main loop.
# NOTE(review): the invocation feeding $info into the while loop was lost.
This diff is collapsed.
Functions relating to creating the neural networks for river flow. These are trained on data from the CEH river gauge database, using forcing from the WRF model (precipitation and temperature).
The key functions are:
runNNtrain - trains a neural network
baseline_model - the setup of the neural net
create_generic_nn - creates a neural net trained on data from multiple rivers (in river_dict), producing a generic model which can be applied to ungauged rivers
import numpy as np
import pickle
import datetime as dt
from scipy.stats.stats import pearsonr
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def windowSum(data, window):
    """Trailing windowed sum: column vector where row j (for j >= window)
    holds sum(data[j-window+1 : j+1]); the first `window` rows all hold the
    sum of the first `window` samples."""
    total = np.zeros([len(data), 1])
    # Seed the warm-up rows with the sum over the first full window.
    total[:window, 0] = np.sum(data[:window])
    # Start each remaining row with its own sample ...
    total[window:, 0] = data[window:]
    # ... then accumulate the preceding window-1 samples.
    for offset in range(1, window):
        total[window:, 0] += data[offset:offset - window]
    return total
def lagData(data, lag):
    """Shift `data` forward by `lag` samples as a column vector; the first
    `lag` rows are padded with the first sample."""
    shifted = np.zeros([len(data), 1])
    # Pad the warm-up rows with the initial value.
    shifted[:lag, 0] = data[0]
    # Everything else is the series delayed by `lag`.
    shifted[lag:, 0] = data[:-lag]
    return shifted
def baseline_model(input_len, node_width):
    """Build and compile the two-layer feed-forward regression net:
    one relu hidden layer of `node_width` units over `input_len` inputs,
    a single linear output unit, MSE loss with the adam optimiser."""
    net = Sequential()
    hidden = Dense(node_width, input_dim=input_len, init='normal', activation='relu')
    net.add(hidden)
    net.add(Dense(1, init='normal'))
    net.compile(loss='mean_squared_error', optimizer='adam')
    return net
def modelErrorMetrics(preds, obs):
    """Print and return RMSE, Pearson correlation and Nash-Sutcliffe
    efficiency of predictions `preds` against observations `obs`.

    Returns np.array([rmse, pearson_r, nse])."""
    residuals = preds - obs
    rmse = np.sqrt(np.mean(residuals ** 2))
    print('RMSE - ' + str(rmse))
    corr = pearsonr(preds, obs)
    print('Pearson correlation - ' + str(corr[0]))
    # 1 - (error variance / variance of obs about their mean)
    nss = 1 - np.mean(residuals ** 2) / np.mean((obs - np.mean(obs)) ** 2)
    print('Nash-Sutcliffe efficiency - ' + str(nss))
    return np.asarray([rmse, corr[0], nss])
def runNNtrain(train_flux, train_data, no_epochs):
    """Scale the training features and fit the neural net to river flux.

    Returns [nn_scaler, nn_model].

    NOTE(review): recovered from a collapsed diff; the scaler fit and the
    model fit call are missing -- do not run as-is.
    """
    # scale the data ready for neural net fitting
    nn_scaler = StandardScaler()
    # NOTE(review): transform() without a prior fit() raises; a
    # nn_scaler.fit(train_data) line was presumably lost here -- confirm.
    train_data_scale = nn_scaler.transform(train_data)
    # set up the neural net (hidden width = number of input features)
    nn_model = baseline_model(train_data.shape[1], train_data.shape[1])
    # and train it -- NOTE(review): the fit call was lost; the residue fused
    # into this comment suggests
    # nn_model.fit(train_data_scale, train_flux, nb_epoch=no_epochs, batch_size=5, verbose=0)
    # output the model and the error metrics
    return [nn_scaler, nn_model]
def nn_create_run_data(precipitation, temp, precipitation_sums_lags, temp_sums_lags):
    """Assemble the NN feature matrix from raw precipitation and temperature.

    Column 0 is precipitation, column 1 temperature; derived columns are
    appended in order: precip window-sums, precip lags, temp window-sums,
    temp lags.  Each *_sums_lags argument is a [sums, lags] pair of lists.
    """
    features = np.asarray([precipitation, temp]).T
    precip_sums, precip_lags = precipitation_sums_lags[0], precipitation_sums_lags[1]
    t_sums, t_lags = temp_sums_lags[0], temp_sums_lags[1]
    for window in precip_sums:
        features = np.append(features, windowSum(features[:, 0], window), axis=1)
    for shift in precip_lags:
        features = np.append(features, lagData(features[:, 0], shift), axis=1)
    for window in t_sums:
        features = np.append(features, windowSum(features[:, 1], window), axis=1)
    for shift in t_lags:
        features = np.append(features, lagData(features[:, 1], shift), axis=1)
    return features
def create_generic_nn(river_dict, generic_model_files=None, train_dates=None, pt_sums_lags=None):
    """Train a 'generic' neural net on the pooled data of every river in
    river_dict, for later application to ungauged rivers.

    NOTE(review): recovered from a collapsed diff -- several lines are
    evidently missing (see inline notes); do not run as-is.
    """
    if generic_model_files is None:
        generic_model_files = ['generic_nn.h5', 'generic_nn_train']
    if pt_sums_lags is None:
        # default window-sums and lags for precipitation and temperature
        precipitation_sums_lags = [[7,14,21,30,60], [1,2,3,4,5,6]]
        temp_sums_lags = [[7,14,28], [1,2,3,4,5,6]]
        # NOTE(review): no else branch unpacking a caller-supplied
        # pt_sums_lags survives here -- confirm against the full source.
    if train_dates is None:
        train_dates = [dt.datetime(2005,1,1), dt.datetime(2007,12,30)]
    # get the combined data from all the rivers in river_dict
    # seed the accumulators with a NaN element/row; stripped off below
    all_flux = np.empty(1)
    all_flux[:] = np.nan
    all_train_data = np.empty([1,23])  # 23 features incl. catchment area -- TODO confirm
    all_train_data[:] = np.nan
    # collate the training data from every river, appending the catchment size
    for this_river in river_dict.values():
        print('Adding data from ' + this_river.river_name)
        # NOTE(review): the matching 'try:' line was lost in the collapsed
        # diff -- the 'except TypeError' below is orphaned as captured.
        [this_flux_data, this_train_data] = this_river.prepNeuralNetData(train_dates, True)[0:2]
        this_train_data = np.append(this_train_data, np.ones([len(this_train_data),1])*this_river.catchment_area, axis=1)
        all_flux = np.append(all_flux, this_flux_data)
        all_train_data = np.append(all_train_data, this_train_data, axis=0)
        except TypeError:
            print(this_river.river_name + ': No flux data to add')
    # drop the NaN seed element/row
    all_flux = all_flux[1:]
    all_train_data = all_train_data[1:,:]
    [generic_scaler, generic_model] = runNNtrain(all_flux, all_train_data, 500)
    # save the model separately as it doesn't like being pickled
    # NOTE(review): the save call was lost; the residue '[0])' suggests
    # something like generic_model.save(generic_model_files[0]).
    # and save out the rest of the relevant data
    nn_dict = {'all_flux':all_flux, 'all_train_data':all_train_data, 'nn_scaler':generic_scaler}
    with open(generic_model_files[1], 'wb') as output_file:
        pickle.dump(nn_dict, output_file, pickle.HIGHEST_PROTOCOL)
    return [generic_model_files[0], nn_dict]
from math import sqrt, pi, sin, cos, tan, atan2 as arctan2
import csv
def OStoLL(E,N):
    """Convert OSGB36 British National Grid coordinates to latitude/longitude
    in degrees on the GRS80 (WGS84) ellipsoid.

    E, N are the British national grid coordinates - eastings and northings (m).
    Returns (lat, lon) in degrees.  Uses math.atan2 imported as arctan2.

    NOTE(review): indentation reconstructed from a whitespace-stripped
    capture following the standard form of this well-known conversion
    routine -- confirm loop extents against the full source.
    """
    a, b = 6377563.396, 6356256.909 #The Airy 180 semi-major and semi-minor axes used for OSGB36 (m)
    F0 = 0.9996012717 #scale factor on the central meridian
    lat0 = 49*pi/180 #Latitude of true origin (radians)
    lon0 = -2*pi/180 #Longtitude of true origin and central meridian (radians)
    N0, E0 = -100000, 400000 #Northing & easting of true origin (m)
    e2 = 1 - (b*b)/(a*a) #eccentricity squared
    n = (a-b)/(a+b)
    #Initialise the iterative variables
    lat,M = lat0, 0
    while N-N0-M >= 0.00001: #Accurate to 0.01mm
        lat = (N-N0-M)/(a*F0) + lat;
        M1 = (1 + n + (5./4)*n**2 + (5./4)*n**3) * (lat-lat0)
        M2 = (3*n + 3*n**2 + (21./8)*n**3) * sin(lat-lat0) * cos(lat+lat0)
        M3 = ((15./8)*n**2 + (15./8)*n**3) * sin(2*(lat-lat0)) * cos(2*(lat+lat0))
        M4 = (35./24)*n**3 * sin(3*(lat-lat0)) * cos(3*(lat+lat0))
        #meridional arc
        M = b * F0 * (M1 - M2 + M3 - M4)
    #transverse radius of curvature
    nu = a*F0/sqrt(1-e2*sin(lat)**2)
    #meridional radius of curvature
    rho = a*F0*(1-e2)*(1-e2*sin(lat)**2)**(-1.5)
    eta2 = nu/rho-1
    secLat = 1./cos(lat)
    VII = tan(lat)/(2*rho*nu)
    VIII = tan(lat)/(24*rho*nu**3)*(5+3*tan(lat)**2+eta2-9*tan(lat)**2*eta2)
    IX = tan(lat)/(720*rho*nu**5)*(61+90*tan(lat)**2+45*tan(lat)**4)
    X = secLat/nu
    XI = secLat/(6*nu**3)*(nu/rho+2*tan(lat)**2)
    XII = secLat/(120*nu**5)*(5+28*tan(lat)**2+24*tan(lat)**4)
    XIIA = secLat/(5040*nu**7)*(61+662*tan(lat)**2+1320*tan(lat)**4+720*tan(lat)**6)
    dE = E-E0
    #These are on the wrong ellipsoid currently: Airy1830. (Denoted by _1)
    lat_1 = lat - VII*dE**2 + VIII*dE**4 - IX*dE**6
    lon_1 = lon0 + X*dE - XI*dE**3 + XII*dE**5 - XIIA*dE**7
    #Want to convert to the GRS80 ellipsoid.
    #First convert to cartesian from spherical polar coordinates
    H = 0 #Third spherical coord.
    x_1 = (nu/F0 + H)*cos(lat_1)*cos(lon_1)
    y_1 = (nu/F0+ H)*cos(lat_1)*sin(lon_1)
    z_1 = ((1-e2)*nu/F0 +H)*sin(lat_1)
    #Perform Helmut transform (to go between Airy 1830 (_1) and GRS80 (_2))
    s = -20.4894*10**-6 #The scale factor -1
    tx, ty, tz = 446.448, -125.157, + 542.060 #The translations along x,y,z axes respectively
    rxs,rys,rzs = 0.1502, 0.2470, 0.8421 #The rotations along x,y,z respectively, in seconds
    rx, ry, rz = rxs*pi/(180*3600.), rys*pi/(180*3600.), rzs*pi/(180*3600.) #In radians
    x_2 = tx + (1+s)*x_1 + (-rz)*y_1 + (ry)*z_1
    y_2 = ty + (rz)*x_1 + (1+s)*y_1 + (-rx)*z_1
    z_2 = tz + (-ry)*x_1 + (rx)*y_1 + (1+s)*z_1
    #Back to spherical polar coordinates from cartesian
    #Need some of the characteristics of the new ellipsoid
    a_2, b_2 =6378137.000, 6356752.3141 #The GSR80 semi-major and semi-minor axes used for WGS84(m)
    e2_2 = 1- (b_2*b_2)/(a_2*a_2) #The eccentricity of the GRS80 ellipsoid
    p = sqrt(x_2**2 + y_2**2)
    #Lat is obtained by an iterative proceedure:
    lat = arctan2(z_2,(p*(1-e2_2))) #Initial value
    latold = 2*pi
    while abs(lat - latold)>10**-16:
        lat, latold = latold, lat
        nu_2 = a_2/sqrt(1-e2_2*sin(latold)**2)
        lat = arctan2(z_2+e2_2*nu_2*sin(latold), p)
    #Lon and height are then pretty easy
    lon = arctan2(y_2,x_2)
    H = p/cos(lat) - nu_2
    #Uncomment this line if you want to print the results
    #print [(lat-lat_1)*180/pi, (lon - lon_1)*180/pi]
    #Convert to degrees
    lat = lat*180/pi
    lon = lon*180/pi
    #Job's a good'n.
    return lat, lon
def read_csv_unheaded(filename, cols):
    """Read an unheaded CSV, returning a list of `cols` column-lists.

    NOTE(review): recovered from a collapsed diff -- the row-handling and
    column-append lines are missing (presumably this_list.append(row[i])
    in the row loop and output.append(this_list) per column); as captured
    this is a syntax error.  Restore from the full source.
    """
    output = []
    for i in range(0,cols):
        this_list = []
        with open(filename, 'rt') as this_file:
            this_file_data = csv.reader(this_file)
            for row in this_file_data:
                # NOTE(review): loop body lost in the capture
    return output
def OS_text_convert(grid_ref_str):
    """Convert a lettered OS grid reference (e.g. 'SX 123 456') to a numeric
    [easting, northing] pair.

    NOTE(review): recovered from a collapsed diff -- the guard around the
    unequal-length warning and the guard/except around the grid-square
    lookup are missing; the bare print statements below are orphaned.
    """
    grid_sqr = grid_ref_str[0:2]    # two-letter 100 km grid square code
    ref = grid_ref_str[2:].strip()  # numeric part: first half x, second half y
    ref_x = ref[0:int(len(ref)/2)]
    ref_y = ref[int(len(ref)/2):]
    # NOTE(review): presumably guarded by a length-parity check (line lost)
    print('Grid reference x and y different lengths')
    ref_grid = OS_letter_conversion_list()
    # look up the 100 km-square prefix digits for the two-letter code
    x_add = [ref_grid[2][x] for x, y in enumerate(ref_grid[0]) if y == grid_sqr][0]
    y_add = [ref_grid[1][x] for x, y in enumerate(ref_grid[0]) if y == grid_sqr][0]
    # NOTE(review): presumably inside an except IndexError branch (lines lost)
    print('Grid 2-letter identifier not recognised')
    # a '0' offset means no prefix digit is prepended
    if x_add == '0':
        x_add = ''
    if y_add == '0':
        y_add = ''
    new_grid = [float(x_add + ref_x) , float(y_add + ref_y)]
    return new_grid
def OS_letter_conversion_list():
    """Return the lookup table mapping two-letter OS grid-square codes to
    their easting/northing prefix digits (used by OS_text_convert).

    NOTE(review): the list literal's contents were lost in the collapsed
    diff -- as captured this is a syntax error; restore from the full source.
    """
    osl_list = [
    return osl_list
import numpy as np
import netCDF4 as nc
import datetime as dt
from dateutil.relativedelta import relativedelta
import os
import csv
Functions related to retrieving WRF output.
It expects the WRF data to be in monthly netCDF files. There should also be a netCDF with the grid info (lat, lon, lat_u, lon_u, lat_v, lon_v, all_vars) — the default filename for this was lost in this capture — though the grid info could be included in any of the monthly netCDFs. The path to these is provided by wrf_directory.
def daterange(start_date, end_date, step):
    """Yield datetimes from start_date up to and including end_date's day,
    advancing by `step` days (step may be fractional, e.g. 0.125 = 3 h)."""
    span = (end_date - start_date).days + 1
    for offset in np.arange(0, span, step):
        yield start_date + dt.timedelta(float(offset))
def read_csv_dict_temp(filename):
    """Read a headed CSV into a dict of column-name -> list of values.

    NOTE(review): recovered from a collapsed diff -- the inner loop body is
    missing (as captured this is a syntax error), and opening in 'rb' mode
    breaks csv.DictReader under Python 3 (needs text mode).  Confirm both
    against the full source.
    """
    output = {}
    # first pass: read to the last row just to learn the column names
    with open(filename, 'rb') as this_file:
        this_file_data = csv.DictReader(this_file)
        for row in this_file_data:
            test_row = row
    this_keys = test_row.keys()
    # one further full pass per column, collecting that column's values
    # (O(rows * cols) file reads -- simple but slow on big files)
    for tk in this_keys:
        this_entry = []
        with open(filename, 'rb') as this_file:
            this_file_data = csv.DictReader(this_file)
            for row in this_file_data:
                # NOTE(review): body lost -- presumably this_entry.append(row[tk])
        output[tk] = this_entry
    return output
def get_WRF_info(wrf_directory, *args):
    """Return the WRF grid info (mass/u/v-point lat-lon arrays plus the
    variable name list) read from a grid-info netCDF file.

    *args - optionally the info-file path to use instead of the default.

    NOTE(review): recovered from a collapsed diff -- the 'else' branch is
    missing and the default filename literal was stripped, so as captured
    info_file is always overwritten with ''.  home_dir is set but unused,
    suggesting chdir lines were also lost.
    """
    home_dir = os.getcwd()
    if args:
        info_file = args  # NOTE(review): binds the whole args tuple, not args[0] -- confirm
    # NOTE(review): an 'else:' evidently belongs before the next line.
    info_file = ''
    test_nc = nc.Dataset(info_file, 'r')
    # index 1 on the leading (time) dimension; squeeze down to 2-D arrays
    output = {'XLAT': np.squeeze(test_nc.variables['XLAT'][1,:,:]),
    'XLON': np.squeeze(test_nc.variables['XLONG'][1,:,:]),
    'XLAT_U': np.squeeze(test_nc.variables['XLAT_U'][1,:,:]),
    'XLON_U': np.squeeze(test_nc.variables['XLONG_U'][1,:,:]),
    'XLAT_V': np.squeeze(test_nc.variables['XLAT_V'][1,:,:]),
    'XLON_V': np.squeeze(test_nc.variables['XLONG_V'][1,:,:]),
    'ALL_VARS': test_nc.variables.keys()}
    return output
# a function to read the reduced netcdfs of only the variables of interest as generated by ncecat (using script
def read_WRF_year_month(this_year, this_month, wrf_directory):
    """Read one month of WRF 'T2' and 'RAINNC' fields onto a fixed 3-hourly
    date list, filling times absent from the file with NaN.

    NOTE(review): recovered from a collapsed diff -- the date-list append,
    the netCDF filename suffix, the 'else:' branches and the chdir calls
    are all missing; as captured this is a syntax error.
    """
    home_dir = os.getcwd()  # NOTE(review): chdir to wrf_directory presumably lost
    data_output = {}
    full_date_list = []
    # 3-hourly (0.125 day) steps spanning the month
    for each_date in daterange(dt.datetime(this_year,this_month,2,0,0,0), dt.datetime(this_year,this_month,1,21,0,0) + relativedelta(months=+1),0.125):
        # NOTE(review): loop body lost -- presumably full_date_list.append(each_date)
    data_output['times'] = full_date_list
    # load in the nc file which has just T2, RAINNC, and Times (as strings)
    # NOTE(review): the filename suffix literal was stripped in the capture
    year_nc = nc.Dataset(str(this_year) + '_' + "%02d"%this_month + '', 'r')
    times = year_nc.variables['Times'][:]
    # decode each byte-array 'Times' row into a datetime
    times_list_dt = [dt.datetime.strptime(str(b''.join(np.asarray(times)[i,:]).decode('UTF-8')),'%Y-%m-%d_%H:%M:%S') for i in range(0, len(times))]
    # do the temp
    data_output['T2'] = np.zeros([len(full_date_list), len(year_nc.dimensions['south_north']), len(year_nc.dimensions['west_east'])])
    temp_series = year_nc.variables['T2'][:]
    for this_row, this_date in enumerate(full_date_list):
        if [x for x in times_list_dt if x == this_date]:
            data_output['T2'][this_row,:,:] = [temp_series[x,:,:] for x,y in enumerate(times_list_dt) if y == this_date][0]
        # NOTE(review): an 'else:' evidently belongs before the NaN fill below
        data_output['T2'][this_row,:,:] = np.nan
    temp_series = None  # drop the full-month array to free memory
    # do the rain
    data_output['RAINNC'] = np.zeros([len(full_date_list), len(year_nc.dimensions['south_north']), len(year_nc.dimensions['west_east'])])
    rain_series = year_nc.variables['RAINNC'][:]
    for this_row, this_date in enumerate(full_date_list):
        if [x for x in times_list_dt if x == this_date]:
            data_output['RAINNC'][this_row,:,:] = [rain_series[x,:,:] for x,y in enumerate(times_list_dt) if y == this_date][0]
        # NOTE(review): an 'else:' evidently belongs here too
        data_output['RAINNC'][this_row,:,:] = np.nan
    rain_series = None
    # head back to the starting directory
    # NOTE(review): os.chdir(home_dir) presumably lost
    # and give out the data
    return data_output
# Deprecated version for reading individual variables. There's a flaw in one of the C libraries for the python netCDF4 package which makes doing it by loading daily files a baaaaad idea (will crash after ~ 400)
def read_WRF_var(*args, **kwargs):
*args - the spatial request, either 'all', or a
**kwargs -
'date_to_retrieve' - retrieve a single date. Should be as a string of the form e.g. '2000-01-01'
'start_date' - retrieves between this date and a given 'end_date' will return empty if no end date specified. Dates should be a string as above
'vars' -
# grab working directory to return to at the end
home_dir = os.getcwd()
data_output = {}
# parse the inputs and set up the list of relevant dates and empty arrays in an output dictionary for the variables
date_list = []
if 'date_to_retrieve' in kwargs:
date_list.append(dt.datetime.strptime(kwargs['date_to_retrieve'], '%Y-%m-%d'))
elif 'start_date' in kwargs:
if 'end_date' in kwargs:
for each_date in daterange(dt.datetime.strptime(kwargs['start_date'],'%Y-%m-%d'), dt.datetime.strptime(kwargs['end_date'],'%Y-%m-%d'),1):
full_date_list = []
for each_date in daterange(dt.datetime.strptime(kwargs['start_date']+' 21:00:00','%Y-%m-%d %H:%M:%S'), dt.datetime.strptime(kwargs['end_date']+' 18:00:00','%Y-%m-%d %H:%M:%S') + dt.timedelta(days=1),0.125):
data_output['times'] = full_date_list
print('No end date specified')
print('No dates requested')
# change to correct year directory, currently only copes with one year to prevent having to change directory halfway through a loop