#!/usr/bin/env python
# coding: utf-8

# # tmax/tmin netCDF manipulation

# import packages to open/write to netCDF (may need to pip install netCDF4)
import netCDF4
import numpy as np
import datetime as dt
import cftime

# for installing/importing xarray in lieu of netCDF4
# may need to do conda install depending on environment
import sys
#!conda install --yes --prefix {sys.prefix} xarray
import xarray as xr
import dask

# pandas for array manipulation and datetime, reduce to merge dfs
import pandas as pd
from functools import reduce

# for plotting, likely will not be required for finalized scripts
#import cartopy.crs as ccrs
#import matplotlib.pyplot as plt
#import matplotlib.patches as mpatches
#from pandas.plotting import register_matplotlib_converters
#register_matplotlib_converters()

# for manipulating directories, merging files
import os
import glob

# for logging
import logging

# for viewing entire pandas dataframes
#pd.set_option("display.max_rows", None)


# ## Input Variables - to be modified as needed

## date ranges
# create daily datetimeindex ranging from pdStart to pdEnd
pdStart = '1991-01-01'
pdEnd = '2020-12-31'
pdDates = pd.date_range(start=pdStart, end=pdEnd)

# monthly datetimeindex for pdDates
pdMonths = pd.date_range(start=pdStart, end=pdEnd, freq="MS")

# leap year range for pdDates, using 2/29 dates
leapDates = pd.date_range(start=pdStart, end=pdEnd, freq="Y")
leapDates = leapDates[leapDates.is_leap_year]
leapDates = leapDates.map(lambda x: x.replace(month=2, day=29))

## nc file input locations
# these need to be updated for the local compute infrastructure
print("These will need to change")
# tmax
tmax_in = './PRISM_Data/T_Max_1991_2020.nc'
# tmin
tmin_in = './PRISM_Data/T_Min_1991_2020.nc'
# tmax red threshold
tmax_red_in = './HeatRisk.v1.5.Red.MaxT.nc'
print(tmax_in)
print(tmin_in)
print(tmax_red_in)

# specify output directory
root_out = './CHMO/Outputs'
logName = root_out + '/' + dt.date.today().strftime("%m_%d_%Y") + '.log'
logging.basicConfig(filename=logName, level=logging.DEBUG)


# ### datasets read

# tmax read
# open as xr dataset; cache=False saves memory
#tmax_DS = xr.open_dataset(tmax_in, chunks={"lat": 1000, "lon": 1000, "time": 1000}, cache=False)
tmax_DS = xr.open_dataset(tmax_in, cache=False)
print(tmax_DS)  # .values

#test = 'E:/T_Max_1981_2010_Compress.nc'
#max_ds = xr.open_dataset(test, cache=False)  # chunks={"lat": 621, "lon": 1405, "time": 365}
#latIn = [48.375]
#lonIn = [-124.7916667]
#lats = [30.893]
#lons = [-103.393]
#dsloc_max = max_ds.sel(lon=lonIn, lat=latIn, method='nearest')
#print(dsloc_max.compute())
#print(tmax_DS)

# tmin read
# open as xr dataset; cache=False saves memory
tmin_DS = xr.open_dataset(tmin_in, cache=False)  # chunks={"lat": 621, "lon": 1405, "time": 365}
print(tmin_DS)

# tmax red thresh read
# open as xr dataset
tmax_red = xr.open_dataset(tmax_red_in, cache=False)
print(tmax_red)


# ## create output subdirectories

# In[36]:

# output locations, make subdirectories if they do not exist
month_dir = root_out + '/Monthly'
if not os.path.exists(month_dir):
    os.makedirs(month_dir)

season_dir = root_out + '/Seasonal'
if not os.path.exists(season_dir):
    os.makedirs(season_dir)


# ## cell iteration

# In[ ]:

#%%timeit
#import time
#start = time.time()
#"the code you want to test stays here"
#%load_ext line_profiler
#import pandas as pd
#import modin.pandas as pd

# sample vals
#lats = [36]    # [28.5, 36, 44.5]
#lons = [-108]  # [-99.5, -108, -115]

#from dask.distributed import Client
#client = Client()
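# An inert sketch of the chunked/lazy read hinted at by the commented chunks= calls
# above: opening with dask chunks keeps memory flat, and .sel(..., method='nearest')
# stays lazy until .compute(). The chunk sizes here are illustrative assumptions,
# not tuned values.
#
#tmax_lazy = xr.open_dataset(tmax_in, chunks={"time": 365, "lat": 310, "lon": 700}, cache=False)
#point = tmax_lazy.sel(lon=-112.0, lat=33.5, method='nearest')  # lazy 1-D time series
#print(point['tmpmax'].compute())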
#### create subfiles for each parallel process ####
base_season_file = pd.DataFrame(columns=['season', 'lat', 'lon', 't_ave_max', 'h_exc_min', 'h_exc_mean',
                                         'h_exc_max', 'WC_min', 'WC_max', 'WC_mean', 'dur_min', 'dur_max',
                                         'dur_mean', 'u_max_max', 'u_max_mean', 'u_min_max', 'u_min_mean',
                                         'intsy_min', 'intsy_max', 'intsy_mean', 'season_ter'])
base_month_file = pd.DataFrame(columns=['month', 'lat', 'lon', 't_ave_max', 'h_exc_min', 'h_exc_mean',
                                        'h_exc_max', 'WC_min', 'WC_max', 'WC_mean', 'dur_min', 'dur_max',
                                        'dur_mean', 'u_max_max', 'u_max_mean', 'u_min_max', 'u_min_mean',
                                        'intsy_min', 'intsy_max', 'intsy_mean', 'month_ter'])
monthly_out = month_dir + '/' + 'Monthly_All.csv'
seasonal_out = season_dir + '/' + 'Seasonal_All.csv'
base_month_file.to_csv(monthly_out, index=False)
base_season_file.to_csv(seasonal_out, index=False)

# actual list of vals
lats = tmax_DS.lat.values
lons = tmax_DS.lon.values
#lats = [48.16833]
#lons = [-124.7916666666665]
#length.append(len(a))
#print(dview["coords"][0], dview["coords"][1])

### Chris subset code
#lats = [30.893]
#lons = [-103.393]
#lats = [48.375]
#lons = [-124.7916667]
#lats = [48.166666666665]
#lons = [-124.7916666666665]

#%timeit
#### try vectorization options to calculate results for the entire array; the per-cell for loop slows things down
#tmax_DS
#drop_nulls = tmax_DS.where(tmax_DS.tmpmax != -9999, drop=True)

for x in lons:
    for y in lats:
        #print('Processing', x, y)
        # insert all iterations

        ## subset tmax_DS, tmin_DS, and tmax_red to the current grid cell (originally the Phoenix test point)
        # set grid location
        lonIn = x
        latIn = y

        # look into putting a constraint on using nearest? i.e. if nearest is way too far away, do what?
        # tmax
        dsloc_max = tmax_DS.sel(lon=lonIn, lat=latIn, method='nearest')
        #dsloc_max = tmax_DS.sel(lon=lonIn, lat=latIn)
        # tmin
        dsloc_min = tmin_DS.sel(lon=lonIn, lat=latIn, method='nearest')
        #dsloc_min = tmin_DS.sel(lon=lonIn, lat=latIn)
        # threshold
        dsloc_red = tmax_red.sel(lon=lonIn, lat=latIn, method='nearest')
        #dsloc_red = tmax_red.sel(lon=lonIn, lat=latIn)

        # skip the cell if values are null in either dsloc_max (all 0s) or dsloc_red (-9999)
        if np.all(dsloc_max['tmpmax'].values == 0) or np.all(dsloc_red['tmax_p95'].values == -9999):
            #print('Skipping cell', lonIn, ' ', latIn, ' due to null cell.')
            logging.info('Skipping cell ' + str(lonIn) + ', ' + str(latIn) + ' due to null cell.')
            continue

        # convert the tmax_p95 from Fahrenheit to Celsius
        dsloc_red['tmax_p95_c'] = (dsloc_red['tmax_p95'] - 32) * 5.0 / 9.0
        dsloc_red = dsloc_red.drop_vars('tmax_p95')

        ## convert thresh to pandas dataframe for extending datetime
        df_red = dsloc_red.to_dataframe()
        # convert the converted fill value (-5572.77832) back to -9999, in the threshold column only
        df_red.loc[df_red['tmax_p95_c'] < -5000, 'tmax_p95_c'] = -9999

        # extend the one-year threshold climatology across every matching day in pdDates
        # TODO: pull this out of the for statement
        for index, row in df_red.iterrows():
            #print(index, row['lat'])
            for date in pdDates:
                if date.day == index.day and date.month == index.month:
                    #print("Matched", date, "to", index)
                    df_red.loc[pd.to_datetime(date), ['tmax_p95_c']] = row['tmax_p95_c']

        # assign the cell's lat/lon to all rows (e.g. latitude 33.5°, longitude -112°)
        df_red = df_red.assign(lat=latIn, lon=lonIn)
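        # An inert vectorized alternative to the per-row date-matching loop above: build a
        # (month, day) lookup from the one-year climatology and map it across pdDates in a
        # single pass, instead of scanning all of pdDates for every climatology row.
        # 'clim' and 'lookup' are hypothetical working names, not part of the pipeline.
        #
        #clim = dsloc_red.to_dataframe()['tmax_p95_c']
        #lookup = {(ts.month, ts.day): v for ts, v in clim.items()}
        #df_red = pd.DataFrame(
        #    {'tmax_p95_c': [lookup.get((d.month, d.day), np.nan) for d in pdDates]},
        #    index=pdDates).assign(lat=latIn, lon=lonIn)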
        ## create min/max dataframe: join min, left join max and red, accounting for potential leap days
        # build max df
        df_max = dsloc_max.to_dataframe()
        print("df_max - initial")
        print(df_max)
        # build min df
        df_min = dsloc_min.to_dataframe()
        # join df_min to df_max
        df_max = df_max.join(df_min['tmpmin'], on='time', how='left')
        # merge red threshold tmax_p95_c on time; lat/lon already aligned
        df_max = df_max.join(df_red['tmax_p95_c'], on='time', how='left')

        # check joined dataframe for any missing leap year threshold values - pull from the feb 28th value
        for date in leapDates:
            feb28 = date - dt.timedelta(days=1)
            if date in df_max.index and pd.isna(df_max.at[date, 'tmax_p95_c']):
                df_max.at[date, 'tmax_p95_c'] = df_max.at[feb28, 'tmax_p95_c']

        ## tmpmax/min averaging by month and 3-month window
        monthly_averages = pd.DataFrame()
        monthly_averages['tmpmax_ave'] = df_max['tmpmax'].resample('MS').mean()
        monthly_averages['tmpmin_ave'] = df_max['tmpmin'].resample('MS').mean()
        monthly_averages['tmp_ave'] = monthly_averages.mean(axis=1)

        # create 3 month window empty df
        rolling_averages = pd.DataFrame()
        # each month 'onset' is the month+2, so 06-30 = June/July/Aug window
        # sort dates latest to earliest, take the rolling window where there is at least 1 non-na value,
        # apply the aggregate method, then resort from earliest to latest
        # average temp
        rolling_averages['tmp_ave'] = monthly_averages['tmp_ave'].sort_index(ascending=False).rolling(3, min_periods=1).mean().sort_index(ascending=True)

        ## heatwave assignment
        # assign heatwave values
        # heat exceeded threshold: where tmpmax >= tmax_p95_c
        df_max['heat_exceed'] = np.where(df_max['tmpmax'] >= df_max['tmax_p95_c'], 1, 0)
        # intensity: where tmpmax >= tmax_p95_c
        df_max['intensity'] = np.where(df_max['tmpmax'] >= df_max['tmax_p95_c'],
                                       df_max['tmpmax'] - df_max['tmax_p95_c'], np.nan)

        # if no days exceed the heat threshold, continue through the rest of the loop
        #### Chris: remove to output the missing 15% of cells
        if df_max[df_max['heat_exceed'] > 0].empty:
            print('No days exceed heat threshold, skipping cell.')
            logging.info('No days exceed heat threshold for ' + str(lonIn) + ', ' + str(latIn) + ', skipping cell.')
            continue

        # heat exceedance by month and 3-month window
        monthly_averages['heat_exceeds'] = df_max['heat_exceed'].resample('MS').sum()
        rolling_averages['heat_exceeds'] = monthly_averages['heat_exceeds'].sort_index(ascending=False).rolling(3, min_periods=1).sum().sort_index(ascending=True)

        # find heatwave events and flag days as part of an event
        # * [code inspiration](https://stackoverflow.com/questions/40555036/pandas-flag-consecutive-values)
        # create "moving window"
        wave_prev = df_max.shift(1)['heat_exceed']
        wave_next = df_max.shift(-1)['heat_exceed']
        wave_next2 = df_max.shift(-2)['heat_exceed']

        # create a separate df of event start/end positions
        df_waves = pd.DataFrame(
            dict(start=np.flatnonzero((wave_prev != 1) & (df_max['heat_exceed'] == 1) & (wave_next == 1)),
                 end=np.flatnonzero((df_max['heat_exceed'] == 1) & (wave_next == 0) & (wave_prev == 1))))

        # get length of heat wave event (add 1 for first index val)
        df_waves['length'] = 1 + df_waves['end'] - df_waves['start']
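        # A compact inert alternative for the same run detection: label each run of identical
        # heat_exceed values with a group id via cumsum, then keep runs of exceed days with
        # length >= 2. Unlike the shift()-based start/end matching above, this also closes an
        # event that runs through the final day of the record.
        #
        #runs = (df_max['heat_exceed'] != df_max['heat_exceed'].shift()).cumsum()
        #run_size = df_max.groupby(runs)['heat_exceed'].transform('size')
        #in_event = (df_max['heat_exceed'] == 1) & (run_size >= 2)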
        # for every heat event in df_waves, add information back to df_max
        # set eventNum to 1 for the starting event
        eventNum = 1
        curYear = 0
        for sInd, wavelen in zip(df_waves['start'], df_waves['length']):
            # name heat event by annual sequence number: EVENTNUM_YEAR_LON_LAT
            # if same year, add to event number, else restart for the new year
            if df_max.index[sInd].year == curYear:
                eventNum += 1
            else:
                eventNum = 1
            # string format with leading 0
            eventStr = str(eventNum).zfill(2)
            heat_wave = (eventStr + "_" + str(df_max.index[sInd].year) + "_"
                         + str(df_max.iloc[sInd]['lon']) + "_" + str(df_max.iloc[sInd]['lat']))
            print("heat_wave " + str(heat_wave))
            # iterate through all days in the heat event
            curInd = sInd
            for i in range(0, wavelen):
                df_max.loc[[df_max.index[curInd]], 'heat_wave'] = heat_wave
                df_max.loc[[df_max.index[curInd]], 'event_dur'] = wavelen
                df_max.loc[[df_max.index[curInd]], 'event_on'] = df_max.index[sInd]
                df_max.loc[[df_max.index[curInd]], 'event_end'] = df_max.index[sInd + wavelen - 1]
                curInd += 1
            curYear = df_max.index[sInd].year

        ## calculate statistics
        # group by heat event
        print("df_max - second, just by df_events")
        # debug dump of the labeled daily frame (overwritten each cell)
        df_max.to_csv('./df_max.csv', index=False)
        try:
            df_events = df_max.groupby('heat_wave')
        except KeyError:
            # no 'heat_wave' column means no events were flagged for this cell
            logging.info('No heat wave events for cell ' + str(lonIn) + ', ' + str(latIn) + ', skipping cell.')
            continue
        print(df_events)

        # list of statistics to include
        # (plain .max() on the event columns keeps flat column names for the merge)
        dfs = [df_events[['event_dur', 'event_on', 'event_end']].max(),
               df_events['tmpmax'].agg(['max', 'mean']),
               df_events['tmpmin'].agg(['max', 'mean']),
               df_events['intensity'].agg(['sum'])]
        # merge on heat_wave
        df_out = reduce(lambda left, right: pd.merge(left, right, on='heat_wave'), dfs)
        # add lat/lon
        df_out = df_out.assign(lat=latIn, lon=lonIn)
        # rename columns
        cols = {'event_dur': 'duration',
                'event_on': 'onset',
                'event_end': 'end',
                'max_x': 'ult_max', 'mean_x': 'mean_max',
                'max_y': 'ult_min', 'mean_y': 'mean_min',
                'sum': 'intensity'}
        df_out = df_out.rename(columns=cols)

        ## windowing
        # monthly
        # create df with onset as a true DatetimeIndex
        df_out['onset'] = pd.to_datetime(df_out['onset'])
        df_onset = df_out.set_index('onset')
        # create monthly summary, empty the df
        monthly_summary = pd.DataFrame()
        # create monthly summary info using resampling
        # NOTE: only considers the onset date when resampling; does not consider dates in the
        # end-date period (fix somehow?). Could create a duplicate entry in df_onset whenever
        # the df_onset['end'] month != the df_onset['onset'] month.
        # resample by month start, doing aggregate statistics
        monthly_summary['wave_count'] = df_onset['duration'].resample('MS').count()
        monthly_summary['duration_sum'] = df_onset['duration'].resample('MS').sum()    # sum of durations
        monthly_summary['duration_mean'] = df_onset['duration'].resample('MS').mean()  # mean of durations
        monthly_summary['ult_max'] = df_onset['ult_max'].resample('MS').max()          # max of ult_max
        monthly_summary['mean_max'] = df_onset['mean_max'].resample('MS').mean()       # mean of mean_max
        monthly_summary['ult_min'] = df_onset['ult_min'].resample('MS').max()          # max of ult_min
        monthly_summary['mean_min'] = df_onset['mean_min'].resample('MS').mean()       # mean of mean_min
        monthly_summary['intensity'] = df_onset['intensity'].resample('MS').sum().replace(0, np.nan)  # sum of event intensity

        # fill missing months with np.nan, except wave_count and duration_sum which get a 0 fill
        monthly_summary = monthly_summary.reindex(pdMonths, fill_value=np.nan)
        monthly_summary['wave_count'] = monthly_summary['wave_count'].fillna(0)
        monthly_summary['duration_sum'] = monthly_summary['duration_sum'].fillna(0)

        # append tmpmax/min averaging
        monthly_summary = monthly_summary.join(monthly_averages['tmp_ave'], how='left')
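        # An inert sketch of the fix floated in the NOTE above: explode each event into one
        # row per month it touches, so months containing only the tail of an event still get
        # credit when resampling (events weighted equally per month here; proration is another
        # option). 'expanded' is a hypothetical working frame, not used by the pipeline.
        #
        #expanded = df_out.copy()
        #expanded['month'] = [pd.date_range(on.replace(day=1), end, freq='MS')
        #                     for on, end in zip(df_out['onset'], df_out['end'])]
        #expanded = expanded.explode('month')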
        # append heat exceedance days
        # is left join correct?
        monthly_summary = monthly_summary.join(monthly_averages['heat_exceeds'], how='left')
        # assign lat/lon
        monthly_summary = monthly_summary.assign(lat=latIn, lon=lonIn)
        # set index name
        monthly_summary.index.name = 'month'

        # 3 month rolling
        # create 3 month window empty df
        rolling_summary = pd.DataFrame()
        # each month 'onset' is the month+2, so 06-30 = June/July/Aug window
        # sort dates latest to earliest, take the rolling window where there is at least 1 non-na value,
        # apply the aggregate method, then resort from earliest to latest
        # wave count
        rolling_summary['wave_count'] = monthly_summary['wave_count'].sort_index(ascending=False).rolling(3, min_periods=1).sum().sort_index(ascending=True)
        # sum of durations
        rolling_summary['duration_sum'] = monthly_summary['duration_sum'].sort_index(ascending=False).rolling(3, min_periods=1).sum().sort_index(ascending=True)
        # mean of durations
        rolling_summary['duration_mean'] = monthly_summary['duration_mean'].sort_index(ascending=False).rolling(3, min_periods=1).mean().sort_index(ascending=True)
        # max of ult_max
        rolling_summary['ult_max'] = monthly_summary['ult_max'].sort_index(ascending=False).rolling(3, min_periods=1).max().sort_index(ascending=True)
        # mean of mean_max
        rolling_summary['mean_max'] = monthly_summary['mean_max'].sort_index(ascending=False).rolling(3, min_periods=1).mean().sort_index(ascending=True)
        # max of ult_min
        rolling_summary['ult_min'] = monthly_summary['ult_min'].sort_index(ascending=False).rolling(3, min_periods=1).max().sort_index(ascending=True)
        # mean of mean_min
        rolling_summary['mean_min'] = monthly_summary['mean_min'].sort_index(ascending=False).rolling(3, min_periods=1).mean().sort_index(ascending=True)
        # sum of event intensity
        rolling_summary['intensity'] = monthly_summary['intensity'].sort_index(ascending=False).rolling(3, min_periods=1).sum().sort_index(ascending=True)

        # append tmpmax/min averaging
        rolling_summary = rolling_summary.join(rolling_averages['tmp_ave'], how='left')
        # append heat exceedance days
        rolling_summary = rolling_summary.join(rolling_averages['heat_exceeds'], how='left')
        # assign lat/lon
        rolling_summary = rolling_summary.assign(lat=latIn, lon=lonIn)
        # set index name
        rolling_summary.index.name = 'season'
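        # The sort-descending / rolling / sort-ascending dance above emulates a forward-looking
        # window. An inert equivalent using pandas' FixedForwardWindowIndexer (pandas >= 1.1),
        # shown for wave_count only:
        #
        #from pandas.api.indexers import FixedForwardWindowIndexer
        #fwd3 = FixedForwardWindowIndexer(window_size=3)
        #rolling_summary['wave_count'] = monthly_summary['wave_count'].rolling(fwd3, min_periods=1).sum()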
        ## binning into terciles
        # monthly
        # columns shared by the per-month frames
        terColsBase = ['wave_count', 'duration_sum', 'duration_mean', 'ult_max', 'mean_max',
                       'ult_min', 'mean_min', 'intensity', 'lat', 'lon']

        # sort each month of monthly_summary into its own df (Jan..Dec); concatenating onto an
        # empty base frame guarantees the expected columns exist even for an empty month
        monthList = [pd.concat([pd.DataFrame(columns=terColsBase),
                                monthly_summary[monthly_summary.index.month == m]])
                     for m in range(1, 13)]
        # keep an unsorted copy for the full 30 year period statistics
        thirty_year_month_List = list(monthList)

        # output location for tercile'd dfs
        tercileMonths = []
        # for every month df in monthList
        for index, frame in enumerate(monthList):
            # sort ascending by tmp_ave
            monthList[index] = frame.sort_values(by=['tmp_ave'])
            # extend tercileMonths with monthList[index] split into thirds
            tercileMonths.extend(np.array_split(monthList[index], 3))
        #print(tercileMonths)
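        # Inert alternative for the tercile split: np.array_split hands back equal-count thirds
        # of the sorted rows, while pd.qcut can label the same terciles in place without
        # materializing three sub-frames (ties at bin edges may need duplicates='drop'):
        #
        #labels = pd.qcut(monthList[0]['tmp_ave'], 3, labels=['below', 'average', 'above'])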
        # 3 month rolling
        # utilize the code from monthly, applied to the 3 month rolling df
        # (monthly_summary and rolling_summary share the same month-start index,
        # so the month mask is equivalent on either)
        seasonList = [pd.concat([pd.DataFrame(columns=terColsBase),
                                 rolling_summary[rolling_summary.index.month == m]])
                      for m in range(1, 13)]
        # keep an unsorted copy for the full 30 year period statistics
        thirty_year_season_List = list(seasonList)

        # output location for tercile'd dfs
        tercileSeasons = []
        # for every season df in seasonList
        for index, frame in enumerate(seasonList):
            # sort ascending by tmp_ave
            seasonList[index] = frame.sort_values(by=['tmp_ave'])
            # extend tercileSeasons with seasonList[index] split into thirds
            tercileSeasons.extend(np.array_split(seasonList[index], 3))
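        # Inert sketch: the per-tercile stat dicts assembled below could also come from one
        # .agg call per tercile frame (a small frame of min/mean/max rows), at the cost of
        # flattening the result into the h_exc_*/WC_*/dur_*/intsy_* names afterwards:
        #
        #stats = tercileMonths[0].agg({'heat_exceeds': ['min', 'mean', 'max'],
        #                              'wave_count': ['min', 'max', 'mean'],
        #                              'duration_sum': ['min', 'max', 'mean'],
        #                              'intensity': ['min', 'max', 'mean']})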
        ## tercile statistics
        # monthly
        # list to insert values into
        terVals = []
        # month tercile stats
        for index, frame in enumerate(tercileMonths):
            # create dict of stats for populating the df
            terVals.insert(index, {
                'month': tercileMonths[index].index[0].month,
                't_ave_max': tercileMonths[index]['tmp_ave'].max(),         # max ave temp in ter
                'h_exc_min': tercileMonths[index]['heat_exceeds'].min(),    # min heat exceed days in ter
                'h_exc_mean': tercileMonths[index]['heat_exceeds'].mean(),  # mean heat exceed days in ter
                'h_exc_max': tercileMonths[index]['heat_exceeds'].max(),    # max heat exceed days in ter
                'WC_min': tercileMonths[index]['wave_count'].min(),         # min heat wave count in ter
                'WC_max': tercileMonths[index]['wave_count'].max(),         # max heat wave count in ter
                'WC_mean': tercileMonths[index]['wave_count'].mean(),       # mean heat wave count in ter
                'dur_min': tercileMonths[index]['duration_sum'].min(),      # min heat wave days in ter
                'dur_max': tercileMonths[index]['duration_sum'].max(),      # max heat wave days in ter
                'dur_mean': tercileMonths[index]['duration_sum'].mean(),    # mean heat wave days in ter
                'u_max_max': tercileMonths[index]['ult_max'].max(),         # max of ult_max in ter
                'u_max_mean': tercileMonths[index]['ult_max'].mean(),       # mean of ult_max in ter
                'u_min_max': tercileMonths[index]['ult_min'].max(),         # max of ult_min in ter
                'u_min_mean': tercileMonths[index]['ult_min'].mean(),       # mean of ult_min in ter
                'intsy_min': tercileMonths[index]['intensity'].min(),       # min of intensity in ter
                'intsy_max': tercileMonths[index]['intensity'].max(),       # max of intensity in ter
                'intsy_mean': tercileMonths[index]['intensity'].mean()      # mean of intensity in ter
            })

        terCols = ['month', 'lat', 'lon', 't_ave_max', 'h_exc_min', 'h_exc_mean', 'h_exc_max',
                   'WC_min', 'WC_max', 'WC_mean', 'dur_min', 'dur_max', 'dur_mean',
                   'u_max_max', 'u_max_mean', 'u_min_max', 'u_min_mean',
                   'intsy_min', 'intsy_max', 'intsy_mean']
        dfTMS = pd.DataFrame(terVals)
        # assign lat/lon, reindex (not really necessary)
        dfTMS = dfTMS.assign(lat=latIn, lon=lonIn)
        dfTMS = dfTMS.reindex(columns=terCols)

        # assign the below/average/above indicator for each month
        dfTMS['month_ter'] = ''
        for i in range(1, 13):
            belowLoc = int(dfTMS[dfTMS['month'] == i]['t_ave_max'].astype(float).idxmin())            # get min loc
            aveLoc = int(dfTMS[dfTMS['month'] == i].nsmallest(2, 't_ave_max')['t_ave_max'].idxmax())  # get mid loc
            aboveLoc = int(dfTMS[dfTMS['month'] == i]['t_ave_max'].astype(float).idxmax())            # get max loc
            # assign vals
            dfTMS.at[belowLoc, 'month_ter'] = (str(i) + ' below')
            dfTMS.at[aveLoc, 'month_ter'] = (str(i) + ' average')
            dfTMS.at[aboveLoc, 'month_ter'] = (str(i) + ' above')
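        # Inert sketch of the same below/average/above assignment via rank, avoiding the three
        # idx lookups per month (assumes exactly three tercile rows per month, as built above):
        #
        #order = dfTMS.groupby('month')['t_ave_max'].rank(method='first').map(
        #    {1.0: 'below', 2.0: 'average', 3.0: 'above'})
        #dfTMS['month_ter'] = dfTMS['month'].astype(str) + ' ' + order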
        #### stats for the full 30 year period monthList
        dictionary = []
        for index, frame in enumerate(thirty_year_month_List):
            # create dict of stats for the full period
            values = {
                'month': thirty_year_month_List[index].index[0].month,
                'lat': latIn,
                'lon': lonIn,
                't_ave_max': thirty_year_month_List[index]['tmp_ave'].max(),
                'h_exc_min': thirty_year_month_List[index]['heat_exceeds'].min(),
                'h_exc_mean': thirty_year_month_List[index]['heat_exceeds'].mean(),
                'h_exc_max': thirty_year_month_List[index]['heat_exceeds'].max(),
                'WC_min': thirty_year_month_List[index]['wave_count'].min(),
                'WC_max': thirty_year_month_List[index]['wave_count'].max(),
                'WC_mean': thirty_year_month_List[index]['wave_count'].mean(),
                'dur_min': thirty_year_month_List[index]['duration_sum'].min(),
                'dur_max': thirty_year_month_List[index]['duration_sum'].max(),
                'dur_mean': thirty_year_month_List[index]['duration_sum'].mean(),
                'u_max_max': thirty_year_month_List[index]['ult_max'].max(),
                'u_max_mean': thirty_year_month_List[index]['ult_max'].mean(),
                'u_min_max': thirty_year_month_List[index]['ult_min'].max(),
                'u_min_mean': thirty_year_month_List[index]['ult_min'].mean(),
                'intsy_min': thirty_year_month_List[index]['intensity'].min(),
                'intsy_max': thirty_year_month_List[index]['intensity'].max(),
                'intsy_mean': thirty_year_month_List[index]['intensity'].mean(),
                'month_ter': '30 Year'
            }
            dictionary.append(values)
        new_dict = pd.DataFrame(dictionary)
        #new_dict.assign(lat=latIn, lon=lonIn)
        final_month_file = pd.concat([dfTMS, new_dict]).sort_values(by=["month", "t_ave_max"])
        #print(final_month_file)

        # seasonal
        terVals = []
        # season tercile stats
        for index, frame in enumerate(tercileSeasons):
            # create dict of stats for populating the df
            terVals.insert(index, {
                'season': tercileSeasons[index].index[0].month,
                't_ave_max': tercileSeasons[index]['tmp_ave'].max(),         # max ave temp in ter
                'h_exc_min': tercileSeasons[index]['heat_exceeds'].min(),    # min heat exceed days in ter
                'h_exc_mean': tercileSeasons[index]['heat_exceeds'].mean(),  # mean heat exceed days in ter
                'h_exc_max': tercileSeasons[index]['heat_exceeds'].max(),    # max heat exceed days in ter
                'WC_min': tercileSeasons[index]['wave_count'].min(),         # min heat wave count in ter
                'WC_max': tercileSeasons[index]['wave_count'].max(),         # max heat wave count in ter
                'WC_mean': tercileSeasons[index]['wave_count'].mean(),       # mean heat wave count in ter
                'dur_min': tercileSeasons[index]['duration_sum'].min(),      # min heat wave days in ter
                'dur_max': tercileSeasons[index]['duration_sum'].max(),      # max heat wave days in ter
                'dur_mean': tercileSeasons[index]['duration_sum'].mean(),    # mean heat wave days in ter
                'u_max_max': tercileSeasons[index]['ult_max'].max(),         # max of ult_max in ter
                'u_max_mean': tercileSeasons[index]['ult_max'].mean(),       # mean of ult_max in ter
                'u_min_max': tercileSeasons[index]['ult_min'].max(),         # max of ult_min in ter
                'u_min_mean': tercileSeasons[index]['ult_min'].mean(),       # mean of ult_min in ter
                'intsy_min': tercileSeasons[index]['intensity'].min(),       # min of intensity in ter
                'intsy_max': tercileSeasons[index]['intensity'].max(),       # max of intensity in ter
                'intsy_mean': tercileSeasons[index]['intensity'].mean()      # mean of intensity in ter
            })

        terCols = ['season', 'lat', 'lon', 't_ave_max', 'h_exc_min', 'h_exc_mean', 'h_exc_max',
                   'WC_min', 'WC_max', 'WC_mean', 'dur_min', 'dur_max', 'dur_mean',
                   'u_max_max', 'u_max_mean', 'u_min_max', 'u_min_mean',
                   'intsy_min', 'intsy_max', 'intsy_mean']
        dfTSS = pd.DataFrame(terVals)
        # assign lat/lon, reindex (not really necessary)
        dfTSS = dfTSS.assign(lat=latIn, lon=lonIn)
        dfTSS = dfTSS.reindex(columns=terCols)

        # assign vals for the below/average/above indicator
        dfTSS['season_ter'] = ''
        for i in range(1, 13):
            belowLoc = int(dfTSS[dfTSS['season'] == i]['t_ave_max'].astype(float).idxmin())            # get min loc
            aveLoc = int(dfTSS[dfTSS['season'] == i].nsmallest(2, 't_ave_max')['t_ave_max'].idxmax())  # get mid loc
            aboveLoc = int(dfTSS[dfTSS['season'] == i]['t_ave_max'].astype(float).idxmax())            # get max loc
            # assign vals
            dfTSS.at[belowLoc, 'season_ter'] = (str(i) + ' below')
            dfTSS.at[aveLoc, 'season_ter'] = (str(i) + ' average')
            dfTSS.at[aboveLoc, 'season_ter'] = (str(i) + ' above')
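        # Inert sketch: the monthly and seasonal stat dicts differ only in their key name
        # ('month' vs 'season'), so a hypothetical helper could build both and cut the
        # duplication:
        #
        #def ter_stats(frame, key):
        #    # shared min/mean/max block used by both the tercile and 30-year summaries
        #    out = {key: frame.index[0].month, 't_ave_max': frame['tmp_ave'].max()}
        #    for col, tag in [('heat_exceeds', 'h_exc'), ('wave_count', 'WC'),
        #                     ('duration_sum', 'dur'), ('intensity', 'intsy')]:
        #        out[tag + '_min'] = frame[col].min()
        #        out[tag + '_max'] = frame[col].max()
        #        out[tag + '_mean'] = frame[col].mean()
        #    out['u_max_max'] = frame['ult_max'].max()
        #    out['u_max_mean'] = frame['ult_max'].mean()
        #    out['u_min_max'] = frame['ult_min'].max()
        #    out['u_min_mean'] = frame['ult_min'].mean()
        #    return out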
        #### stats for the full 30 year period seasonList
        dictionary = []
        for index, frame in enumerate(thirty_year_season_List):
            # create dict of stats for the full period
            values = {
                'season': thirty_year_season_List[index].index[0].month,
                'lat': latIn,
                'lon': lonIn,
                't_ave_max': thirty_year_season_List[index]['tmp_ave'].max(),
                'h_exc_min': thirty_year_season_List[index]['heat_exceeds'].min(),
                'h_exc_mean': thirty_year_season_List[index]['heat_exceeds'].mean(),
                'h_exc_max': thirty_year_season_List[index]['heat_exceeds'].max(),
                'WC_min': thirty_year_season_List[index]['wave_count'].min(),
                'WC_max': thirty_year_season_List[index]['wave_count'].max(),
                'WC_mean': thirty_year_season_List[index]['wave_count'].mean(),
                'dur_min': thirty_year_season_List[index]['duration_sum'].min(),
                'dur_max': thirty_year_season_List[index]['duration_sum'].max(),
                'dur_mean': thirty_year_season_List[index]['duration_sum'].mean(),
                'u_max_max': thirty_year_season_List[index]['ult_max'].max(),
                'u_max_mean': thirty_year_season_List[index]['ult_max'].mean(),
                'u_min_max': thirty_year_season_List[index]['ult_min'].max(),
                'u_min_mean': thirty_year_season_List[index]['ult_min'].mean(),
                'intsy_min': thirty_year_season_List[index]['intensity'].min(),
                'intsy_max': thirty_year_season_List[index]['intensity'].max(),
                'intsy_mean': thirty_year_season_List[index]['intensity'].mean(),
                'season_ter': '30 Year'
            }
            dictionary.append(values)
        new_dict = pd.DataFrame(dictionary)
        final_season_file = pd.concat([dfTSS, new_dict]).sort_values(by=["season", "t_ave_max"], ascending=True)
        #print(final_season_file)

        ## generate outputs
        # as csv; the per-cell dynamic naming convention is kept for reference
        #monthly_out = month_dir + '/' + '_'.join(['monthly', str(x), str(y)]) + '.csv'
        #seasonal_out = season_dir + '/' + '_'.join(['seasonal', str(x), str(y)]) + '.csv'

        # append this cell's results to the master files created before the loop
        # (root_out, month_dir, and season_dir are already defined above)
        #y = c.id
        pid = os.getpid()
        monthly_out = month_dir + '/' + 'Monthly_All.csv'
        seasonal_out = season_dir + '/' + 'Seasonal_All.csv'
        final_month_file.to_csv(monthly_out, mode='a', header=False, index=False)
        final_season_file.to_csv(seasonal_out, mode='a', header=False, index=False)

        ### push results to a list to account for the file locking race
        #final_data_season.append(final_month_file)
        #final_data_month.append(final_season_file)
        #return new_dict
        #return final_season_file

#%lprun -f test test()
#test(-103.393, 30.893)
#f = client.map(test, (-103.393, 30.893), (-104.393, 31.893))
#for x in f:
#    print(x.result)
#view = rc[:]  # use all engines
#view.apply(test, (-103.393, 30.893), (-104.393, 31.893))
#pool = Pool(4)
#results = pool.map(test, [(-103.393, 30.893), (-104.393, 31.893)])
#test_coors = [[-103.393, 30.893], [-104.393, 31.893]]
#lats = lats.tolist()
#lons = lons.tolist()
#print(lats)
#test_coors = [lons, lats]
#for x, y in array:
#    test(x, y)
#for v in coords:
#    print(x.x, x.y)
#print(dview['coords'])
#c = dview['coords'][0][0][0]
#d = dview['coords'][1][1][1]
#ch = dview['coords']
### split
#print(a)
#for num, x in enumerate(ch[0]):
#    print(x[0], x[1])
#    for y in x:
#        print(y)
#    dview.apply(test, x, targets=num)  ## targets
#    dview.map(test, x)
#for x, y in ch[1]:
#    dview.apply(test, x, y)
#    print(y[0])
### for each scatter array, execute the function, passing each scatter array to the function
#for x in ch:
#    for y in x:
#        print(y[0], y[1])
#        dview.execute('test(y[0],y[1])')
#print(c, d)
#for x in c:
#    print(y)
#    a = dview.apply(test, y[0], y[1])
#final = dview.gather('a', block=True)
#print(a.result())
#a.gather
#print(final)
#c[:].apply(test, (-103.393, 30.893), (-104.393, 31.893))

#print('Completed processing')
#end = time.time()
#print(end - start)
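# The commented experiments above try several parallel routes (ipyparallel's dview,
# dask.distributed, multiprocessing). An inert minimal sketch of the multiprocessing
# route, assuming the per-cell loop body is refactored into a hypothetical
# process_cell(coord) that appends to a per-process CSV (sidestepping the file
# locking race noted above):
#
#from multiprocessing import Pool
#from itertools import product
#
#def process_cell(coord):
#    lon, lat = coord
#    # ... per-cell work from the loop body above, appending to
#    # month_dir + '/Monthly_' + str(os.getpid()) + '.csv' and the seasonal twin
#    return (lon, lat)
#
#if __name__ == '__main__':
#    with Pool(4) as pool:
#        pool.map(process_cell, product(lons, lats))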
# ## Create master outputs

# Monthly

# In[9]:

filenames = glob.glob(month_dir + "/*.csv")
mergePD = pd.DataFrame()
for file in filenames:
    ### use the chunksize parameter if the computer has low RAM
    df = pd.read_csv(file)
    mergePD = pd.concat([mergePD, df], sort=False, ignore_index=True)

month_all = root_out + '/monthly_all9.csv'
mergePD.to_csv(month_all)

# Seasonal

# In[10]:

filenames = glob.glob(season_dir + "/*.csv")
mergePD = pd.DataFrame()
for file in filenames:
    ## use the chunksize parameter if the computer has low RAM
    df = pd.read_csv(file)
    mergePD = pd.concat([mergePD, df], sort=False, ignore_index=True)

season_all = root_out + '/seasonal_all.csv'
mergePD.to_csv(season_all)

# In[ ]:
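# Inert sketch of the same merge with the low-RAM chunked read the comments mention:
# stream each CSV with chunksize and feed a single concat via a generator, so only
# one chunk is resident at a time. read_csv_chunked is a hypothetical helper.
#
#def read_csv_chunked(path, chunksize=100000):
#    # pd.read_csv with chunksize yields DataFrame chunks; concat reassembles the file
#    return pd.concat(pd.read_csv(path, chunksize=chunksize), ignore_index=True)
#
#mergePD = pd.concat((read_csv_chunked(f) for f in glob.glob(month_dir + "/*.csv")),
#                    sort=False, ignore_index=True)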