In [13]:
# Imports (Python 2-era scientific stack).
# NOTE(review): pandas.tools.plotting and pandas.stats.moments were removed in
# later pandas releases (now pandas.plotting and the .rolling()/.ewm() methods);
# `pt` and `mm` appear unused in the visible cells — confirm before upgrading.
from pylab import *  # star import: supplies numpy/pyplot names used below (array, figure, xcorr, acorr, cohere)
import pandas as pd
import pandas.tools.plotting as pt
from netCDF4 import Dataset
import datetime as dt
import dateutil as dtu
from sklearn import ensemble as ens
from sklearn import linear_model as lin
from sklearn.pipeline import Pipeline
from sklearn import preprocessing as prep
from pandas.stats import moments as mm
In [2]:
# Monthly detrended Nino 3.4 index from NOAA CPC; whitespace-delimited table.
# The separator is a raw string: "\s+" is an invalid escape sequence (a
# SyntaxError in recent Python 3), while r"\s+" is the intended regex.
nino34 = pd.read_csv("http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt", sep=r"\s+")
In [3]:
# Average link strength series from the Azimuth Project's El Niño repository
# (single unnamed column, no header row in the file).
link = pd.read_csv(
    "https://raw.githubusercontent.com/johncarlosbaez/el-nino/master/R/average-link-strength.txt",
    header=None,
)
link.shape
Out[3]:
(1132, 1)
In [4]:
# NCEP/NCAR reanalysis monthly-mean surface pressure, served over OPeNDAP.
# Dataset() opens the remote file; variable data is fetched on slicing.
url = "http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface/pres.mon.mean.nc"
nc = Dataset(url)
#dir(nc)
In [5]:
vars = nc.variables  # NOTE: shadows the vars() builtin; name kept — later cells use it
t = vars['time']     # offsets in hours from 1800-01-01 (see day0 below)
day0 = dt.date(1800, 1, 1)
# amap() was removed from matplotlib/pylab; an explicit comprehension is
# equivalent.  Wrapping in array() keeps `days` a numpy array so the
# elementwise date comparison in the masking cell below still works.
days = array([day0 + dt.timedelta(hours=float(h)) for h in t[:]])
In [6]:
# Pull the full pressure variable into memory: (time, lat, lon) = (802, 73, 144).
# This triggers the actual network transfer from the OPeNDAP server.
air0 = vars["pres"][:, :, :]
# Parenthesized print: identical output under Python 2 (single argument) and
# valid under Python 3.
print(air0.shape)
(802, 73, 144)

In [83]:
#print days
win = 6   # window length: each sample sees `win` consecutive monthly fields
lag = 6   # forecast lead time in months
X, Y, Z = air0.shape   # (time, lat, lon)
a, b, c = air0.strides
# Keep windows ending on/after (Jan 1950 minus the lead time) so samples align
# with the start of the nino34 record.  Two fixes vs the original: the date is
# written 1, 1 (the leading-zero 01, 01 form is a SyntaxError in Python 3),
# and the mask is a plain boolean array rather than a one-element list (list
# wrapping relied on deprecated NumPy fancy-indexing behavior).
mask = days[win - 1:] >= dt.date(1950, 1, 1) - dtu.relativedelta.relativedelta(months=lag)
# Overlapping sliding windows without copying: row i is the win*Y*Z
# consecutive elements starting at month i (second stride steps by single
# elements, c).  This assumes air0 is C-contiguous — as_strided does no bounds
# checking, so the shape/stride pair must stay consistent with air0's layout.
air = np.lib.stride_tricks.as_strided(air0, (X - win + 1, Y * Z * win), (a, c))[mask][:nino34.shape[0]]

print(air.shape)
(778, 63072)

In [84]:
# Two candidate regressors; only the extra-trees ensemble is fit below.
xt = ens.ExtraTreesRegressor(
    n_estimators=2000,
    max_features=0.2,
    max_depth=None,
    min_samples_leaf=1,
    oob_score=False,
    bootstrap=False,
    n_jobs=3,
    verbose=0,
)

# Linear baseline: standardized + normalized features feeding an SGD regressor
# with an epsilon-insensitive loss.  Defined for experimentation; unused here.
sgd_steps = [
    ("scale", prep.StandardScaler()),
    ("norm", prep.Normalizer()),
    ("sgd", lin.SGDRegressor(
        loss="epsilon_insensitive",
        penalty='elasticnet',
        n_iter=1000,
        alpha=(2 ** -20),
        epsilon=1,
        l1_ratio=0.9,
        # learning_rate="optimal",
        shuffle=True,
        verbose=3,
    )),
]
sgd = Pipeline(sgd_steps)

model = xt

# First 400 months form the training period; the remainder is held out.
train = air[0:400]
print(train.shape)
model.fit(X=train, y=nino34.iloc[0:400, -1])
(400, 63072)

Out[84]:
ExtraTreesRegressor(bootstrap=False, compute_importances=None,
          criterion='mse', max_depth=None, max_features=0.2,
          max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
          min_samples_split=2, n_estimators=2000, n_jobs=3,
          oob_score=False, random_state=None, verbose=0)
In [85]:
print model.score(X=train, y=nino34.iloc[0:400,-1]), model.score(air[400:788,:], nino34.iloc[400:788, -1])
1.0 0.370821047578

In [86]:
# Held-out comparison: true index values vs model predictions.
# NOTE(review): predictions are multiplied by 2 here — presumably an ad-hoc
# amplitude correction; confirm, since the forecast cell near the end calls
# model.predict() without this factor.
test = pd.DataFrame({"true": nino34.iloc[400:788, -1], "predicted": 2*model.predict(air[400:788,:])})
test.plot("true", "predicted", kind="scatter", alpha=0.2, marker=".", figsize=(7,7))
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc0f39db550>
In [87]:
# Time-series overlay of the true and predicted series over the test period.
test.plot(alpha=0.2, marker=".", figsize=(20,3))
Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc0f74c50d0>
In [88]:
figure(figsize=(20,3))
# Cross-correlation of predictions against truth at every available lag.
zz=xcorr(test.predicted, test.true, maxlags=None)
In [89]:
figure(figsize=(20,3))
# Autocorrelation of the true nino34 series, all lags.
zz=acorr(test.true, maxlags=None)
In [90]:
figure(figsize=(20,3))
# Autocorrelation of the predicted series, for comparison with the cell above.
zz=acorr(test.predicted, maxlags=None)
In [96]:
figure(figsize=(20,3))
# Magnitude-squared coherence between truth and prediction (64-point FFT
# segments with 50% overlap).
zz=cohere(test.true, test.predicted, NFFT=64, noverlap=32)
In [91]:
# Forecast from the last 6 feature windows (matching the 6-month lead built
# into the masking cell).  NOTE(review): no 2x scaling applied here, unlike
# the test-frame cell above — confirm which convention is intended.
future=pd.DataFrame({"future": model.predict(air[-6:,:])})
In [92]:
# Plot the 6-month forward prediction.
future.plot()
Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc0f743aa90>