from pylab import *
import pandas as pd
import pandas.tools.plotting as pt
from netCDF4 import Dataset
import datetime as dt
import dateutil as dtu
from sklearn import ensemble as ens
from sklearn import linear_model as lin
from sklearn.pipeline import Pipeline
from sklearn import preprocessing as prep
from pandas.stats import moments as mm
nino34 = pd.read_csv("http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt", sep="\s+")
link = pd.read_csv("https://raw.githubusercontent.com/johncarlosbaez/el-nino/master/R/average-link-strength.txt", header=None)
link.shape
url="http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis.derived/surface/pres.mon.mean.nc"
nc=Dataset(url)
#dir(nc)
vars=nc.variables
t = vars['time']
day0 = dt.date(1800,1,1)
days=amap(lambda x: day0 + dt.timedelta(hours=x), t[:])
air0=vars["pres"][:,:,:]
print air0.shape
#print days
win=6
lag = 6
X, Y, Z = air0.shape
a, b, c= air0.strides
mask=[days[win-1:] >= dt.date(1950,01,01) - dtu.relativedelta.relativedelta(months=lag)]
air = np.lib.stride_tricks.as_strided(air0, (X-win+1, Y*Z*win), (a, c))[mask][:nino34.shape[0]]
print air.shape
xt = ens.ExtraTreesRegressor(n_estimators=2000, max_features=0.2, max_depth=None, min_samples_leaf=1,
oob_score=False, bootstrap=False, n_jobs=3, verbose=0)
sgd = Pipeline([
("scale", prep.StandardScaler()),
("norm", prep.Normalizer()),
("sgd",lin.SGDRegressor(loss="epsilon_insensitive", penalty='elasticnet', n_iter=1000, alpha= (2**-20), epsilon=1,
l1_ratio=0.9,
#learning_rate="optimal",
shuffle=True, verbose=3)),
])
model=xt
train = air[0:400]
print train.shape
model.fit(X=train, y=nino34.iloc[0:400,-1])
print model.score(X=train, y=nino34.iloc[0:400,-1]), model.score(air[400:788,:], nino34.iloc[400:788, -1])
test = pd.DataFrame({"true": nino34.iloc[400:788, -1], "predicted": 2*model.predict(air[400:788,:])})
test.plot("true", "predicted", kind="scatter", alpha=0.2, marker=".", figsize=(7,7))
test.plot(alpha=0.2, marker=".", figsize=(20,3))
figure(figsize=(20,3))
zz=xcorr(test.predicted, test.true, maxlags=None)
figure(figsize=(20,3))
zz=acorr(test.true, maxlags=None)
figure(figsize=(20,3))
zz=acorr(test.predicted, maxlags=None)
figure(figsize=(20,3))
zz=cohere(test.true, test.predicted, NFFT=64, noverlap=32)
future=pd.DataFrame({"future": model.predict(air[-6:,:])})
future.plot()