import pandas as pd

# Drop the rows of df where bool_series is True.
def drop_on(df, bool_series, inplace=False):
    rows = df.loc[bool_series].index
    return df.drop(index=rows, inplace=inplace)

# Prefix every column name, except those listed in `ignore`.
def rename_cols_add_prefix(df, ignore=None, prefix='_', inplace=True):
    rename_f = lambda col: col if (ignore is not None and col in ignore) else prefix + col
    return df.rename(columns=rename_f, inplace=inplace)
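For reference, a minimal usage sketch of the two helpers above, on a throwaway frame with made-up values (the `_demo` name is hypothetical and not part of the pipeline):
_demo = pd.DataFrame({'YEAR': [2020, 2021, 2022], 'MON': [1, 1, 1], 'NATL': [25.0, 26.0, 27.0]})
drop_on(_demo, _demo['YEAR'] > 2021, inplace=True)  # removes the 2022 row in place
rename_cols_add_prefix(_demo, ignore=['YEAR', 'MON'], prefix='1_')  # NATL becomes 1_NATL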
#from google.colab import drive
#drive.mount('/content/drive')
#base_path = 'drive/MyDrive/UFRPE-Pesquisas/_2022 - Previsão Chuvas/dados/'
base_path = 'dados/'
This notebook works with the data collected from: http://www.cpc.ncep.noaa.gov/data/indices/
The goal is to preprocess the series and combine them all into a single dataset.
1) North Atlantic, South Atlantic, Global Tropics
Full description on the site: North Atlantic (5-20°North, 60-30°West), South Atlantic (0-20°South, 30°West-10°East), Global Tropics (10°South-10°North, 0-360):
--> Use NATL and SATL - sea temperatures in the Atlantic.
df1 = pd.read_csv(base_path + 'dados_in/1.nasa_gt.csv', delim_whitespace=True, usecols=['YR', 'MON', 'NATL', 'SATL'])
#df1
rename_YR = {'YR': 'YEAR'}
df1.rename(columns=rename_YR, inplace=True)
#drop_on(df1, df1['YEAR']>2021, inplace=True)
df1
| | YEAR | MON | NATL | SATL |
---|---|---|---|---|
0 | 1982 | 1 | 25.67 | 25.26 |
1 | 1982 | 2 | 25.35 | 26.04 |
2 | 1982 | 3 | 25.37 | 26.56 |
3 | 1982 | 4 | 25.36 | 26.26 |
4 | 1982 | 5 | 26.11 | 25.34 |
... | ... | ... | ... | ... |
486 | 2022 | 7 | 27.44 | 23.88 |
487 | 2022 | 8 | 28.09 | 23.26 |
488 | 2022 | 9 | 28.56 | 23.23 |
489 | 2022 | 10 | 28.38 | 23.74 |
490 | 2022 | 11 | 27.48 | 24.37 |
491 rows × 4 columns
2) Monthly OISST.v2 (1981-2010 base period) Niño 1+2, 3, 4 and 3.4
Full description on the site: Monthly OISST.v2 (1981-2010 base period) Niño 1+2 (0-10°South)(90°West-80°West), Niño 3 (5°North-5°South)(150°West-90°West), Niño 4 (5°North-5°South)(160°East-150°West), Niño 3.4 (5°North-5°South)(170-120°West)
--> Use NINO1+2, NINO3, NINO4, and NINO3.4.
df2 = pd.read_csv(base_path + 'dados_in/2.moisst_v2.csv', delim_whitespace=True, usecols=['YR', 'MON', 'NINO1+2', 'NINO3', 'NINO4', 'NINO3.4'])
df2.rename(columns=rename_YR, inplace=True)
#drop_on(df2, df2['YEAR']>2021, inplace=True)
df2
| | YEAR | MON | NINO1+2 | NINO3 | NINO4 | NINO3.4 |
---|---|---|---|---|---|---|
0 | 1982 | 1 | 24.28 | 25.84 | 28.01 | 26.65 |
1 | 1982 | 2 | 25.38 | 26.26 | 27.99 | 26.54 |
2 | 1982 | 3 | 25.22 | 26.92 | 28.18 | 27.09 |
3 | 1982 | 4 | 24.57 | 27.52 | 28.61 | 27.83 |
4 | 1982 | 5 | 24.00 | 27.70 | 29.19 | 28.37 |
... | ... | ... | ... | ... | ... | ... |
486 | 2022 | 7 | 20.67 | 25.51 | 27.90 | 26.68 |
487 | 2022 | 8 | 20.29 | 24.69 | 27.76 | 25.92 |
488 | 2022 | 9 | 19.46 | 24.21 | 27.67 | 25.83 |
489 | 2022 | 10 | 19.06 | 24.17 | 27.60 | 25.92 |
490 | 2022 | 11 | 20.27 | 24.31 | 27.77 | 25.89 |
491 rows × 6 columns
3) Tahiti Sea Level Pressure (1951 - Present)
TAHITI SEA LEVEL PRESS (1000 MB SUBTRACTED) / ORIGINAL DATA
Explanation: sea level pressure (with 1000 mb subtracted), at Tahiti.
df3 = pd.read_csv(base_path + 'dados_in/3.tahiti_slp.csv', delim_whitespace=True)
# reshape from wide (one column per month) to long format
df3 = df3.melt(id_vars=['YEAR'], var_name='MON', value_name='TahiPr')
# map month abbreviations (JAN..DEC) to their numbers (1..12)
months_to_int = dict(JAN=1, FEB=2, MAR=3, APR=4, MAY=5, JUN=6, JUL=7, AUG=8, SEP=9, OCT=10, NOV=11, DEC=12)
df3['MON'] = df3['MON'].apply(lambda x: months_to_int[x])
df3
| | YEAR | MON | TahiPr |
---|---|---|---|
0 | 1951 | 1 | 12.5 |
1 | 1952 | 1 | 9.4 |
2 | 1953 | 1 | 11.8 |
3 | 1954 | 1 | 10.5 |
4 | 1955 | 1 | 11.7 |
... | ... | ... | ... |
847 | 2017 | 12 | 10.5 |
848 | 2018 | 12 | 11.5 |
849 | 2019 | 12 | 10.8 |
850 | 2020 | 12 | 13.3 |
851 | 2021 | 12 | 12.7 |
852 rows × 3 columns
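The same wide-to-long reshape (melt plus month mapping) is reused for the remaining series below. As a minimal sketch of what it does, on a toy frame with made-up values (`_toy` and `VAL` are hypothetical names used only for illustration):
_toy = pd.DataFrame({'YEAR': [1951, 1952], 'JAN': [12.5, 9.4], 'FEB': [11.0, 10.0]})
_toy_long = _toy.melt(id_vars=['YEAR'], var_name='MON', value_name='VAL')
_toy_long['MON'] = _toy_long['MON'].apply(lambda x: months_to_int[x])
# result: one row per (YEAR, MON) pair, e.g. (1951, 1, 12.5), (1952, 1, 9.4), (1951, 2, 11.0), ...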
4) Darwin Sea Level Pressure (1951 - Present)
DARWIN SEA LEVEL PRESS (1000 MB SUBTRACTED) / ORIGINAL DATA
Explanation: sea level pressure (with 1000 mb subtracted), at Darwin.
df4 = pd.read_csv(base_path + 'dados_in/4.darwin_slp.csv', delim_whitespace=True)
df4
| | YEAR | JAN | FEB | MAR | APR | MAY | JUN | JUL | AUG | SEP | OCT | NOV | DEC |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1951 | 5.3 | 5.1 | 7.0 | 9.4 | 11.9 | 11.2 | 13.3 | 13.2 | 12.5 | 11.4 | 9.5 | 8.9 |
1 | 1952 | 6.7 | 7.1 | 8.3 | 10.3 | 10.3 | 12.5 | 13.0 | 12.6 | 12.1 | 9.9 | 7.7 | 8.4 |
2 | 1953 | 6.9 | 7.3 | 8.0 | 8.9 | 13.1 | 12.5 | 13.9 | 13.0 | 13.2 | 10.8 | 9.4 | 8.1 |
3 | 1954 | 4.9 | 7.3 | 7.7 | 7.7 | 10.1 | 12.3 | 12.4 | 11.8 | 11.9 | 10.7 | 9.1 | 6.5 |
4 | 1955 | 8.3 | 3.7 | 6.7 | 8.9 | 9.4 | 10.9 | 11.7 | 11.4 | 10.9 | 9.3 | 7.6 | 7.6 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
66 | 2017 | 6.0 | 7.1 | 7.4 | 9.7 | 11.7 | 13.5 | 13.2 | 13.0 | 12.3 | 9.4 | 7.4 | 7.3 |
67 | 2018 | 4.8 | 7.0 | 6.1 | 9.5 | 12.2 | 13.7 | 12.8 | 13.1 | 12.3 | 11.0 | 9.4 | 6.2 |
68 | 2019 | 6.8 | 9.0 | 8.4 | 10.1 | 12.3 | 13.8 | 14.2 | 14.3 | 14.5 | 11.4 | 9.7 | 8.4 |
69 | 2020 | 7.2 | 7.4 | 9.1 | 10.5 | 12.4 | 12.9 | 12.7 | 12.5 | 11.6 | 10.0 | 9.0 | 6.5 |
70 | 2021 | 5.6 | 5.0 | 8.1 | 9.7 | 11.3 | 13.1 | 12.3 | 13.3 | 12.0 | 9.7 | 8.3 | 6.5 |
71 rows × 13 columns
df4 = df4.melt(id_vars=['YEAR'], var_name='MON', value_name='DarwPr')
df4['MON'] = df4['MON'].apply(lambda x : months_to_int[x])
df4
| | YEAR | MON | DarwPr |
---|---|---|---|
0 | 1951 | 1 | 5.3 |
1 | 1952 | 1 | 6.7 |
2 | 1953 | 1 | 6.9 |
3 | 1954 | 1 | 4.9 |
4 | 1955 | 1 | 8.3 |
... | ... | ... | ... |
847 | 2017 | 12 | 7.3 |
848 | 2018 | 12 | 6.2 |
849 | 2019 | 12 | 8.4 |
850 | 2020 | 12 | 6.5 |
851 | 2021 | 12 | 6.5 |
852 rows × 3 columns
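Both pressure series store the values with 1000 mb already subtracted; if the absolute pressures were ever needed, the offset could simply be added back (a hypothetical post-processing step, not used in this notebook):
#df3['TahiPr_abs'] = df3['TahiPr'] + 1000.0  # hypothetical: absolute sea level pressure in mb
#df4['DarwPr_abs'] = df4['DarwPr'] + 1000.0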
5*) (STAND TAHITI - STAND DARWIN) SEA LEVEL PRESS
STANDARDIZED DATA
--> Not used for prediction. Discarded!
#df5 = pd.read_csv(base_path + 'dados_in/5.stsd_slp.csv', delim_whitespace=True, skiprows=2)
#df5
#df5 = df5.melt(id_vars=['YEAR'], var_name='MON', value_name='STSD_SLP')
#df5['MON'] = df5['MON'].apply(lambda x : months_to_int[x])
#df5
#df5[df5['MONTH']=='JAN'][['YEAR', 'STSD_SLP']].plot.line(x='YEAR', y='STSD_SLP', figsize=(10,6))
#plt.figure(figsize=(10,6))
#sns.regplot(data=df5, x='YEAR', y='STSD_SLP', scatter=False)
#sns.regplot(data=df5, x='YEAR', y='STSD_SLP', scatter_kws=dict(s=15))
#sns.regplot(data=df5, x='YEAR', y='STSD_SLP')
6) 850 MB TRADE WIND INDEX (135E-180W) 5N-5S WEST PACIFIC
ORIGINAL DATA
df6 = pd.read_csv(base_path + 'dados_in/6.wpac850.csv', delim_whitespace=True, skiprows=2)
#df6
df6 = df6.melt(id_vars=['YEAR'], var_name='MON', value_name='TW_WP')
df6['MON'] = df6['MON'].apply(lambda x : months_to_int[x])
df6
| | YEAR | MON | TW_WP |
---|---|---|---|
0 | 1979 | 1 | -1.6 |
1 | 1980 | 1 | 0.3 |
2 | 1981 | 1 | 1.1 |
3 | 1982 | 1 | 1.7 |
4 | 1983 | 1 | 0.1 |
... | ... | ... | ... |
511 | 2017 | 12 | 1.3 |
512 | 2018 | 12 | 0.6 |
513 | 2019 | 12 | -1.0 |
514 | 2020 | 12 | 4.5 |
515 | 2021 | 12 | 0.7 |
516 rows × 3 columns
7) 850 MB TRADE WIND INDEX (175W-140W) 5N-5S CENTRAL PACIFIC
ORIGINAL DATA
df7 = pd.read_csv(base_path + 'dados_in/7.cpac850.csv', delim_whitespace=True, skiprows=2)
#df7
df7 = df7.melt(id_vars=['YEAR'], var_name='MON', value_name='TW_CP')
df7['MON'] = df7['MON'].apply(lambda x : months_to_int[x])
df7
| | YEAR | MON | TW_CP |
---|---|---|---|
0 | 1979 | 1 | 6.3 |
1 | 1980 | 1 | 6.9 |
2 | 1981 | 1 | 11.2 |
3 | 1982 | 1 | 10.1 |
4 | 1983 | 1 | 1.9 |
... | ... | ... | ... |
511 | 2017 | 12 | 9.2 |
512 | 2018 | 12 | 9.1 |
513 | 2019 | 12 | 7.6 |
514 | 2020 | 12 | 12.7 |
515 | 2021 | 12 | 12.6 |
516 rows × 3 columns
8) 850 MB TRADE WIND INDEX (135W-120W) 5N-5S EAST PACIFIC
ORIGINAL DATA
df8 = pd.read_csv(base_path + 'dados_in/8.epac850.csv', delim_whitespace=True, skiprows=2)
#df8
df8 = df8.melt(id_vars=['YEAR'], var_name='MON', value_name='TW_EP')
df8['MON'] = df8['MON'].apply(lambda x : months_to_int[x])
df8
| | YEAR | MON | TW_EP |
---|---|---|---|
0 | 1979 | 1 | 11.2 |
1 | 1980 | 1 | 10.1 |
2 | 1981 | 1 | 13.1 |
3 | 1982 | 1 | 11.6 |
4 | 1983 | 1 | 6.0 |
... | ... | ... | ... |
511 | 2017 | 12 | 9.9 |
512 | 2018 | 12 | 9.1 |
513 | 2019 | 12 | 8.0 |
514 | 2020 | 12 | 10.0 |
515 | 2021 | 12 | 12.4 |
516 rows × 3 columns
FINAL MERGE
all_dfs = [df1, df2, df3, df4, None, df6, df7, df8]
# prefix each data column with its source number, keeping the YEAR/MON keys untouched
for i, df in enumerate(all_dfs):
    if df is not None:
        rename_cols_add_prefix(df, ignore=['YEAR', 'MON'], prefix=str(i+1)+'_', inplace=True)
# inner-join all sources on (YEAR, MON), so only months present in every source survive
df_full = df1
for df in all_dfs[1:]:
    if df is not None:
        df_full = df_full.merge(df, on=['YEAR', 'MON'], how='inner')
df_full
| | YEAR | MON | 1_NATL | 1_SATL | 2_NINO1+2 | 2_NINO3 | 2_NINO4 | 2_NINO3.4 | 3_TahiPr | 4_DarwPr | 6_TW_WP | 7_TW_CP | 8_TW_EP |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1982 | 1 | 25.67 | 25.26 | 24.28 | 25.84 | 28.01 | 26.65 | 12.9 | 6.3 | 1.7 | 10.1 | 11.6 |
1 | 1982 | 2 | 25.35 | 26.04 | 25.38 | 26.26 | 27.99 | 26.54 | 11.7 | 7.0 | 0.8 | 8.3 | 11.7 |
2 | 1982 | 3 | 25.37 | 26.56 | 25.22 | 26.92 | 28.18 | 27.09 | 11.8 | 7.5 | -0.7 | 7.5 | 9.8 |
3 | 1982 | 4 | 25.36 | 26.26 | 24.57 | 27.52 | 28.61 | 27.83 | 12.2 | 10.1 | 0.8 | 6.1 | 7.0 |
4 | 1982 | 5 | 26.11 | 25.34 | 24.00 | 27.70 | 29.19 | 28.37 | 12.5 | 11.9 | 1.1 | 5.5 | 8.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
475 | 2021 | 8 | 27.97 | 24.01 | 20.84 | 25.02 | 28.62 | 26.62 | 15.7 | 13.3 | 5.3 | 9.1 | 9.0 |
476 | 2021 | 9 | 28.57 | 23.67 | 20.76 | 24.84 | 28.25 | 26.49 | 15.9 | 12.0 | 7.3 | 11.7 | 10.6 |
477 | 2021 | 10 | 28.29 | 24.06 | 20.58 | 24.53 | 28.11 | 26.00 | 13.7 | 9.7 | 4.2 | 9.6 | 8.5 |
478 | 2021 | 11 | 27.90 | 24.53 | 20.81 | 24.43 | 27.99 | 25.97 | 13.1 | 8.3 | 5.9 | 10.9 | 9.8 |
479 | 2021 | 12 | 27.05 | 24.88 | 21.31 | 24.06 | 27.68 | 25.60 | 12.7 | 6.5 | 0.7 | 12.6 | 12.4 |
480 rows × 13 columns
#df_full.to_csv(base_path + 'raw_input.csv')
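A lightweight sanity check on the merged frame can be useful here (an optional sketch, not part of the original pipeline): confirm that each (YEAR, MON) pair appears exactly once and count any missing values.
# each (YEAR, MON) pair should appear exactly once after the inner join
assert not df_full.duplicated(subset=['YEAR', 'MON']).any()
# count missing values per column (the source files may use sentinel values rather than NaN)
print(df_full.isna().sum())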
Preparation of the Output Data
Starting from the manually edited spreadsheet...
df = pd.read_csv(base_path + 'dados_out/chuvas_merge.csv')
#df
df
| | Medicao | JP_m | REC_m | MAC_m | JP_a | REC_a | MAC_a | JP | REC | MAC | PRECIP |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 31/01/1970 | 5.5 | 102.2 | 46.4 | NaN | NaN | NaN | 5.50 | 102.2 | 46.4 | 51.4 |
1 | 28/02/1970 | 71.6 | 113.3 | 72.0 | NaN | NaN | NaN | 71.60 | 113.3 | 72.0 | 85.6 |
2 | 31/03/1970 | 385.7 | 269.4 | 372.4 | NaN | NaN | NaN | 385.70 | 269.4 | 372.4 | 342.5 |
3 | 30/04/1970 | 517.9 | 591.3 | 330.6 | NaN | NaN | NaN | 517.90 | 591.3 | 330.6 | 479.9 |
4 | 31/05/1970 | 199.9 | 186.7 | NaN | NaN | NaN | NaN | 199.90 | 186.7 | NaN | 193.3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
632 | 30/09/2022 | 47.8 | NaN | NaN | 46.6 | NaN | 79.2 | 47.20 | NaN | 79.2 | 63.2 |
633 | 31/10/2022 | 25.3 | NaN | NaN | 24.2 | NaN | 88.6 | 24.75 | NaN | 88.6 | 56.7 |
634 | 30/11/2022 | 83.4 | NaN | NaN | NaN | NaN | 194.0 | 83.40 | NaN | 194.0 | 138.7 |
635 | 31/12/2022 | 28.4 | NaN | NaN | 34.4 | NaN | 88.2 | 31.40 | NaN | 88.2 | 59.8 |
636 | 31/01/2023 | 120.5 | NaN | NaN | NaN | NaN | 64.0 | 120.50 | NaN | 64.0 | 92.3 |
637 rows × 11 columns
df['Medicao'] = pd.to_datetime(df['Medicao'], format='%d/%m/%Y')  # dates are day-first (dd/mm/yyyy)
# create new columns for year and month
df['YEAR'] = df['Medicao'].dt.year
df['MON'] = df['Medicao'].dt.month
# reindex columns to put 'YEAR' and 'MON' as first two columns
df = (df.reindex(columns=['YEAR', 'MON'] + list(df.columns[:-2]))
        .drop(columns='Medicao'))
df
| | YEAR | MON | JP_m | REC_m | MAC_m | JP_a | REC_a | MAC_a | JP | REC | MAC | PRECIP |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1970 | 1 | 5.5 | 102.2 | 46.4 | NaN | NaN | NaN | 5.50 | 102.2 | 46.4 | 51.4 |
1 | 1970 | 2 | 71.6 | 113.3 | 72.0 | NaN | NaN | NaN | 71.60 | 113.3 | 72.0 | 85.6 |
2 | 1970 | 3 | 385.7 | 269.4 | 372.4 | NaN | NaN | NaN | 385.70 | 269.4 | 372.4 | 342.5 |
3 | 1970 | 4 | 517.9 | 591.3 | 330.6 | NaN | NaN | NaN | 517.90 | 591.3 | 330.6 | 479.9 |
4 | 1970 | 5 | 199.9 | 186.7 | NaN | NaN | NaN | NaN | 199.90 | 186.7 | NaN | 193.3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
632 | 2022 | 9 | 47.8 | NaN | NaN | 46.6 | NaN | 79.2 | 47.20 | NaN | 79.2 | 63.2 |
633 | 2022 | 10 | 25.3 | NaN | NaN | 24.2 | NaN | 88.6 | 24.75 | NaN | 88.6 | 56.7 |
634 | 2022 | 11 | 83.4 | NaN | NaN | NaN | NaN | 194.0 | 83.40 | NaN | 194.0 | 138.7 |
635 | 2022 | 12 | 28.4 | NaN | NaN | 34.4 | NaN | 88.2 | 31.40 | NaN | 88.2 | 59.8 |
636 | 2023 | 1 | 120.5 | NaN | NaN | NaN | NaN | 64.0 | 120.50 | NaN | 64.0 | 92.3 |
637 rows × 12 columns
df['PRECIP'].isna().sum()
0
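PRECIP appears to be the row-wise mean of the available consolidated station columns (JP, REC, MAC); for example, in the first row (5.50 + 102.2 + 46.4) / 3 ≈ 51.4. A quick check of that reading (a sketch, assuming this interpretation; `recomputed` is a hypothetical name):
# compare PRECIP against the mean of the station columns, ignoring NaNs
recomputed = df[['JP', 'REC', 'MAC']].mean(axis=1).round(1)
print((recomputed - df['PRECIP']).abs().max())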
df_y = df[['YEAR', 'MON', 'PRECIP']].query('YEAR >= 1982 and YEAR < 2022')
#df_y.to_csv(base_path + 'raw_output.csv', index=False)
df_y
| | YEAR | MON | PRECIP |
---|---|---|---|
144 | 1982 | 1 | 106.4 |
145 | 1982 | 2 | 163.5 |
146 | 1982 | 3 | 84.5 |
147 | 1982 | 4 | 329.8 |
148 | 1982 | 5 | 389.0 |
... | ... | ... | ... |
619 | 2021 | 8 | 283.4 |
620 | 2021 | 9 | 56.2 |
621 | 2021 | 10 | 35.5 |
622 | 2021 | 11 | 39.0 |
623 | 2021 | 12 | 38.4 |
480 rows × 3 columns
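If the predictors and the target are ever needed in a single modeling table, they could be joined on the shared (YEAR, MON) keys, since both cover 1982-2021 (a possible next step, sketched here; the notebook itself only writes raw_input.csv and raw_output.csv separately, and 'dataset.csv' is a hypothetical file name):
# join predictors (df_full) and target (df_y) on the shared keys
dataset = df_full.merge(df_y, on=['YEAR', 'MON'], how='inner')
#dataset.to_csv(base_path + 'dataset.csv', index=False)  # hypothetical output name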