forked from BjerknesClimateDataCentre/eshape
-
Notifications
You must be signed in to change notification settings - Fork 0
/
importSOCAT.py
218 lines (183 loc) · 7.92 KB
/
importSOCAT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# SOCAT import. From local .tsv file or ERDDAP
import os
import pandas as pd
import time
from erddapy import ERDDAP
##### -----------------------------
# Don't touch from here
# Local file name derived from the configured source label
# (NOTE(review): `source` comes from the configuration section, not this file).
file = source + '.tsv'
# Truncated header signatures used to locate the data / metadata
# column-header lines inside a SOCAT .tsv file.
SOCATcolheadertextshort = 'Expocode\tversion\tSource_DOI\tQC_Flag'
SOCATmetacolheadertextshort = 'Expocode\tversion\tDataset'
# Cruise QC flag -> accuracy value (see assignment further below)
flagaccuracy = {
    "A": 2.0,
    "B": 2.0,
    "C": 5.0,
    "D": 5.0,
    "E": 10.0,
}
# Read from local files (synthesis + flagE)
if (datafrom == 'local'):
    SOCATmetacolheadertextshort = 'Expocode\tversion\tDataset'
    SOCATcolheadertextshort = 'Expocode\tversion\tSource_DOI\tQC_Flag'
    SOCAT_files = os.listdir(os.path.join(input_dir, 'SOCAT'))
    for file in SOCAT_files:
        filepath = os.path.join(input_dir, 'SOCAT', file)
        # Read metadata header for Cruise Flags and find the number of
        # headerlines before the data
        separator = '\t'
        line = ''
        headerlines = -1
        metaline = ''
        metaheaderlines = -1
        # Context manager guarantees the handle is closed even if the
        # header scan raises (original used open()/close() and leaked
        # the file handle on any exception).
        with open(filepath) as f:
            while SOCATmetacolheadertextshort not in metaline:
                metaline = f.readline()
                metaheaderlines = metaheaderlines + 1  # Where metadata lines start
            endmetaheaderlines = metaheaderlines
            while '\t' in metaline:
                metaline = f.readline()
                endmetaheaderlines = endmetaheaderlines + 1  # End of metadata lines
            # Find where data columns start
            headerlines = headerlines + endmetaheaderlines
            while SOCATcolheadertextshort not in line:
                line = f.readline()
                headerlines = headerlines + 1
        # Create metadata dataframe (per-cruise QC flags for flags A-D)
        metainfoAD = pd.read_csv(filepath, sep='\t',
                                 skiprows=metaheaderlines,
                                 nrows=endmetaheaderlines - metaheaderlines - 1)
        # Read SOCAT data in dataframe
        print(file + " file has " + str(headerlines) + " header lines")
        start_time = time.time()  # Time the script
        ddtype = {0: str, 2: str}  # add type str to columns 0 and 2
        # Read the SOCAT file into a pandas dataframe
        tempdf1 = pd.read_csv(filepath, sep=separator, skiprows=headerlines,
                              dtype=ddtype)
        print("--- %s seconds ---" % (time.time() - start_time))
        print(file + " data frame has " + str(len(tempdf1)) + " lines")
        # Give all the same variable names
        tempdf1.rename(
            columns={'Expocode': vardict['id'],
                     'Source_DOI': vardict['doi'],
                     'latitude [dec.deg.N]': vardict['lat'],
                     'longitude [dec.deg.E]': vardict['lon'],
                     'sample_depth [m]': vardict['dep'],
                     'SST [deg.C]': vardict['temp'],
                     'sal': vardict['sal'],
                     'fCO2rec [uatm]': vardict['fco2w'],
                     'fCO2rec_flag': vardict['fco2wf']},
            inplace=True)
        # Create date python object (tz-aware, UTC)
        tempdtframe = pd.DataFrame(
            {'year': tempdf1['yr'], 'month': tempdf1['mon'],
             'day': tempdf1['day'], 'hour': tempdf1['hh'],
             'minute': tempdf1['mm'], 'seconds': tempdf1['ss']})
        tempdf1['DATEVECTOR1'] = pd.to_datetime(tempdtframe, utc=True)
        # Transform longitude to +-180
        tempdf1.loc[tempdf1[vardict['lon']] > 180, vardict['lon']] = tempdf1[
            vardict['lon']] - 360
        # Subset the dataset HERE (cruise flag assignment takes A LONG TIME).
        # NOTE(review): DATEVECTOR1 is tz-aware; comparing it against naive
        # mindate/maxdate raises in newer pandas — confirm the configured
        # bounds carry a timezone.
        if subset:
            tempdf1 = tempdf1[
                (tempdf1['DATEVECTOR1'] >= pd.to_datetime(mindate)) &
                (tempdf1['DATEVECTOR1'] <= pd.to_datetime(maxdate)) &
                (tempdf1[vardict['lat']] >= minlat) &
                (tempdf1[vardict['lat']] <= maxlat) &
                (tempdf1[vardict['lon']] >= minlon) &
                (tempdf1[vardict['lon']] <= maxlon)].copy()
        tempdf1.reset_index(drop=True, inplace=True)
        print(len(tempdf1))
        # Cruise flags / accuracies
        if 'FlagE' in file:  # Pointless to loop when all has the same flag
            tempdf1['Cruise_flag'] = 'E'
        else:
            # Read cruise flag from metadata lines in SOCAT synthesis A-D.
            # Vectorized expocode->flag lookup replaces the original
            # per-expocode boolean-mask loop (self-described as very slow);
            # expocodes absent from the metadata keep the 'X' placeholder.
            start_time = time.time()
            expoflags = dict(zip(metainfoAD['Expocode'], metainfoAD['QC Flag']))
            tempdf1['Cruise_flag'] = (
                tempdf1[vardict['id']].map(expoflags).fillna('X'))
            print("--- %s seconds for cruise flag assignment ---"
                  % (time.time() - start_time))
        # Merge synthesis and FlagE dataframes in one.
        # DataFrame.append was removed in pandas 2.0; pd.concat without
        # ignore_index is the drop-in equivalent.
        if 'tempdf' not in globals():
            tempdf = tempdf1
        else:
            tempdf = pd.concat([tempdf, tempdf1])
# If "remote"
elif (datafrom == 'remote'):
e = ERDDAP(
server = 'https://data.pmel.noaa.gov/socat/erddap',
protocol = 'tabledap',
)
e.response = 'csv'
e.dataset_id = 'socat_v2021_fulldata'
e.constraints = {
# 'dist_to_land>=': 10
# 'region_id=': A,C,I,N,O,R,T,Z
# 'expocode=':'74AB19900918',
'time>=': mindate,
'time<=': maxdate,
'latitude>=': minlat,
'latitude<=': maxlat,
'longitude>=': minlon,
'longitude<=': maxlon,
'WOCE_CO2_water=': "2"
#synthesis file only has good data (keep questionable/bad?)
# 'fCO2_water_sst_100humidity_uatm=~':"float('nan')"
# Have yet to figure out how to set the nan filter
}
e.variables = ['expocode', 'time', 'latitude', 'longitude', 'depth', 'sal',
'temp', 'fCO2_recommended', 'qc_flag', 'WOCE_CO2_water',
'socat_doi']
tempdf = e.to_pandas(dtype = {10: str, 8: str, 0: str})
# Retain only valid fco2 values (can't figure out how to do it in erdappy
# constrains yet)
tempdf = tempdf.dropna(subset = ['fCO2_recommended (uatm)']).copy()
tempdf.reset_index(drop = True, inplace = True)
# Rename columns
tempdf.rename(
columns = {'expocode': vardict['id'],
'socat_doi': vardict['doi'],
'latitude (degrees_north)': vardict['lat'],
'longitude (degrees_east)': vardict['lon'],
'depth (m)': vardict['dep'],
'temp (degrees C)': vardict['temp'],
'sal (PSU)': vardict['sal'],
'fCO2_recommended (uatm)': vardict['fco2w'],
'WOCE_CO2_water': vardict['fco2wf'],
'qc_flag':'Cruise_flag'},
inplace = True)
# Create python date object
tempdf['DATEVECTOR1'] = pd.to_datetime(tempdf['time (UTC)'])
# Create UNIXDATE and ISO DATEVECTOR from the python date object
tempdf[vardict['unixd']] = tempdf['DATEVECTOR1'].astype('int64') // 10 ** 9
tempdf[vardict['datevec']] = tempdf[
    'DATEVECTOR1'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
# Assign accuracies following cruise flags: vectorized map replaces the
# original per-key boolean-mask loop; flags outside the dictionary
# (e.g. the 'X' placeholder) get 0.0, exactly as before.
tempdf[vardict['fco2wac']] = tempdf['Cruise_flag'].map(flagaccuracy).fillna(0.0)
# Flag fco2 as measured
tempdf[vardict['fco2wc']] = 0
# Estimate alkalinity from salinity, and then, estimate ph and dic
# Assign SOCAT DOI if Source DOI is missing
# NOTE(review): socatdoi is expected from the configuration section;
# the in-file extraction of it is commented out in the header scan.
tempdf.loc[tempdf[vardict['doi']].isna(), vardict['doi']] = socatdoi
# Add source (SOCAT, GLODAP, ARGO, etc...)
tempdf['SOURCE'] = source
# Rename and reset indices before reporting
printdf = tempdf
printdf.reset_index(drop=True, inplace=True)
print('SOCAT frame size is ')
print(printdf.shape)