import pandas as pd
import talib as tl
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
from sklearn.grid_search import GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score


##sma
def SMA(data, ndays, tag):
    """Add a simple moving average of the ``close`` column to ``data``.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        ndays: Rolling window length, in rows.
        tag: Name of the column that receives the moving average.

    Returns:
        The same ``data`` object with column ``tag`` added.  Rows that do
        not yet have a full window are NaN.
    """
    # Series.rolling replaces pd.rolling_mean, which pandas deprecated and
    # later removed; the arithmetic is unchanged.
    data[tag] = data['close'].rolling(window=ndays, center=False).mean()
    return data
# SMA(df,5)

##bbands
def BBANDS(data, ndays):
    """Add upper and lower Bollinger bands of ``close`` to ``data``.

    The bands are the rolling mean plus/minus two rolling standard
    deviations over ``ndays`` rows.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        ndays: Rolling window length, in rows.

    Returns:
        The same ``data`` object with columns ``ub`` (upper band) and
        ``lb`` (lower band) added.  Rows without a full window are NaN.
    """
    # Series.rolling replaces the removed pd.rolling_mean / pd.rolling_std.
    window = data['close'].rolling(window=ndays, center=False)
    middle = window.mean()
    band_width = 2 * window.std()
    data['ub'] = middle + band_width
    data['lb'] = middle - band_width
    return data
# BBANDS(df,50)

##cci
def CCI(data, ndays):
    """Add a Commodity-Channel-Index style oscillator to ``data``.

    The typical price ``(high + low + close) / 3`` is compared with its
    rolling mean and scaled by ``0.015`` times its rolling standard
    deviation over ``ndays`` rows.

    Args:
        data: DataFrame with ``high``, ``low`` and ``close`` columns; it is
            modified in place.
        ndays: Rolling window length, in rows.

    Returns:
        The same ``data`` object with column ``CCI`` added.
    """
    typical_price = (data['high'] + data['low'] + data['close']) / 3
    # Series.rolling replaces the removed pd.rolling_mean / pd.rolling_std.
    window = typical_price.rolling(window=ndays, center=False)
    # NOTE(review): the usual CCI definition divides by the mean absolute
    # deviation; the rolling standard deviation is kept here because that is
    # what this code has always computed -- confirm this is intended.
    data['CCI'] = (typical_price - window.mean()) / (0.015 * window.std())
    return data
# CCI(df,20)

##roc
def ROC(data,n):
    """Add the ``n``-row rate of change of ``close`` to ``data``.

    The value is ``(close - close shifted by n) / (close shifted by n)``.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        n: Number of rows to look back.

    Returns:
        The same ``data`` object with column ``ROC`` added.
    """
    close = data['close']
    change = close.diff(n)
    baseline = close.shift(n)
    data['ROC'] = pd.Series(change / baseline, name='Rate of Change')
    return data
# ROC(df,5)

# Ease of Movement 
def EVM(data, ndays):
    """Add the smoothed Ease of Movement indicator to ``data``.

    The raw value is the change in the bar midpoint divided by a "box
    ratio" of ``(volume / 100000000) / (high - low)``; it is then averaged
    over ``ndays`` rows.

    Previously the smoothed series was computed and thrown away, so the
    raw value was stored and ``ndays`` had no effect.

    Args:
        data: DataFrame with ``high``, ``low`` and ``volume`` columns; it is
            modified in place.
        ndays: Rolling window length, in rows, for the moving average.

    Returns:
        The same ``data`` object with column ``EVM`` added.
    """
    midpoint = (data['high'] + data['low']) / 2
    distance_moved = midpoint - midpoint.shift(1)
    box_ratio = (data['volume'] / 100000000) / (data['high'] - data['low'])
    raw_evm = distance_moved / box_ratio
    # Series.rolling replaces the removed pd.rolling_mean.
    data['EVM'] = raw_evm.rolling(window=ndays, center=False).mean()
    return data
# EVM(df,14)

# Force Index 
def ForceIndex(data, ndays):
    """Add the Force Index to ``data``.

    The value is the ``ndays``-row change in ``close`` multiplied by
    ``volume``.

    Args:
        data: DataFrame with ``close`` and ``volume`` columns; it is
            modified in place.
        ndays: Number of rows over which the close difference is taken.

    Returns:
        The same ``data`` object with column ``FI`` added.
    """
    price_change = data['close'].diff(ndays)
    force = price_change * data['volume']
    data['FI'] = pd.Series(force, name='ForceIndex')
    return data
# ForceIndex(df,1)
##macd
def MACD(data, short=0, long1=0, mid=0):
    """Add MACD difference and signal columns to ``data``.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        short: Span of the fast EMA; ``0`` selects the default of 12.
        long1: Span of the slow EMA; ``0`` selects the default of 26.
        mid: Span of the signal EMA; ``0`` selects the default of 9.

    Returns:
        The same ``data`` object with columns ``sema`` (fast EMA), ``lema``
        (slow EMA), ``macd_dif`` (fast minus slow) and ``macd_dea`` (EMA of
        ``macd_dif``) added.  NaNs in these four columns are replaced by 0.
    """
    # A zero argument means "use the conventional default".
    short = short or 12
    long1 = long1 or 26
    mid = mid or 9

    close = data['close']
    # Series.ewm(span=...).mean() is the replacement pandas names for the
    # deprecated pd.ewma.  Only the columns created here are NaN-filled: the
    # previous whole-frame fillna(0, inplace=True) also overwrote NaNs in
    # unrelated columns (for example moving-average warm-up rows).
    data['sema'] = close.ewm(span=short).mean().fillna(0)
    data['lema'] = close.ewm(span=long1).mean().fillna(0)
    data['macd_dif'] = data['sema'] - data['lema']
    data['macd_dea'] = data['macd_dif'].ewm(span=mid).mean().fillna(0)
    return data

# MACD(df,0,0,0)

def SMA_CN(close, timeperiod):
    """Return the final value of a recursive smoothed average of ``close``.

    Starting from the first element, each step computes
    ``((timeperiod - 1) * previous + current) / timeperiod``.  NaNs in the
    input are replaced via ``np.nan_to_num`` before smoothing.

    Args:
        close: Non-empty sequence of numbers.
        timeperiod: Smoothing length used in the recurrence.

    Returns:
        The smoothed value after the last element.

    Raises:
        TypeError: If ``close`` is empty (``reduce`` has no initial value).
    """
    # reduce is a builtin only on Python 2; functools provides it on both.
    from functools import reduce

    # A float array keeps "/" a true division even for integer input.
    values = np.nan_to_num(np.asarray(close, dtype=float))
    return reduce(
        lambda prev, cur: ((timeperiod - 1) * prev + cur) / timeperiod,
        values,
    )

# 同花顺和通达信等软件中的RSI
def _sma_cn_running(values, timeperiod):
    """Return every intermediate value of the SMA_CN recurrence over ``values``."""
    acc = values[0]
    smoothed = [acc]
    for value in values[1:]:
        acc = ((timeperiod - 1) * acc + value) / timeperiod
        smoothed.append(acc)
    return np.array(smoothed, dtype=float)


def RSI_CN(data, timeperiod):
    """Add an RSI column computed with the recursive SMA smoothing.

    For each row the indicator is ``100 * smoothed(gains) / smoothed(|moves|)``
    where moves are consecutive ``close`` differences (the first difference
    is repeated for the first row), gains are moves with negatives replaced
    by 0, and smoothing follows
    ``((timeperiod - 1) * previous + current) / timeperiod``.

    The smoothing is carried forward row by row, which produces the same
    numbers as re-smoothing every prefix from scratch but in linear time.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        timeperiod: Smoothing length used in the recurrence.

    Returns:
        The same ``data`` object with column ``RSI`` added.  With fewer than
        two rows there is no price difference and the column is all NaN;
        rows where the smoothed absolute move is 0 are NaN as well.
    """
    close = np.asarray(data['close'], dtype=float)
    if len(close) < 2:
        data['RSI'] = np.full(len(close), np.nan)
        return data

    moves = np.diff(close)
    # Repeat the first difference so the result has one value per row.
    moves = np.append(moves[0], moves)
    with np.errstate(divide='ignore', invalid='ignore'):
        gains = np.nan_to_num(np.where(moves < 0, 0.0, moves))
        sizes = np.nan_to_num(np.abs(moves))
        rsi = (_sma_cn_running(gains, timeperiod)
               / _sma_cn_running(sizes, timeperiod) * 100)
    data['RSI'] = rsi
    return data
#RSI_CN(df,14)

##ATR指标主要是用来衡量市场波动的强烈度
def ATR(data,timeperiod):
    """Add TA-Lib's Average True Range to ``data``.

    Args:
        data: DataFrame with ``high``, ``low`` and ``close`` columns; it is
            modified in place.
        timeperiod: Period passed through to ``tl.ATR``.

    Returns:
        The same ``data`` object with column ``ATR`` added.
    """
    high, low, close = (np.array(data[column])
                        for column in ('high', 'low', 'close'))
    data['ATR'] = tl.ATR(high, low, close, timeperiod)
    return data
# ATR(df,14)

def OBV(data):
    """Add TA-Lib's On Balance Volume to ``data``.

    Args:
        data: DataFrame with ``close`` and ``volume`` columns; it is
            modified in place.

    Returns:
        The same ``data`` object with column ``OBV`` added.
    """
    close = np.array(data['close'])
    volume = np.array(data['volume'])
    data['OBV'] = tl.OBV(close, volume)
    return data
# OBV(df)

def MOM(data, timeperiod=5):
    """Add TA-Lib's Momentum indicator of ``close`` to ``data``.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        timeperiod: Period passed through to ``tl.MOM``.  Defaults to 5,
            the value that used to be hard-coded, so existing ``MOM(data)``
            calls behave as before.

    Returns:
        The same ``data`` object with column ``MOM`` added.
    """
    data['MOM'] = tl.MOM(np.array(data['close']), timeperiod=timeperiod)
    return data
# MOM(df)
    

def get_tech_data(df):
    """Build the technical-indicator feature frame for a price DataFrame.

    Works on a copy of ``df``, runs every indicator helper defined in this
    file on it, then removes the raw price/volume columns and the two
    intermediate MACD EMA columns so that only indicator columns remain.

    Args:
        df: DataFrame with ``open``, ``high``, ``close``, ``low``,
            ``volume`` and ``money`` columns.  It is not modified.

    Returns:
        A new DataFrame with the same index as ``df`` holding the indicator
        columns.

    Raises:
        KeyError: If any of the columns to be removed is missing
            (raised by ``DataFrame.drop``).
    """
    data = df.copy()
    for window in (5, 10, 20, 30, 60):
        SMA(data, window, 'sma_%d' % window)
    BBANDS(data, 50)
    MACD(data, 0, 0, 0)
    RSI_CN(data, 6)
    CCI(data, 20)
    ROC(data, 5)
    EVM(data, 14)
    ForceIndex(data, 1)
    ATR(data, 14)
    OBV(data)
    MOM(data)

    # One drop with an explicit axis keyword: the positional form
    # drop(name, 1) is rejected by newer pandas, and eight chained drops
    # copy the frame eight times.
    raw_columns = ['open', 'high', 'close', 'low', 'volume', 'money',
                   'sema', 'lema']
    return data.drop(raw_columns, axis=1)
# Load daily bars for '000300.XSHG' (the CSI 300 index predicted later in this
# script).  get_price is not defined in this file -- presumably supplied by the
# hosting research platform; confirm.
df = get_price('000300.XSHG', end_date='2019-02-17', frequency='daily', fields=['open','high','close','low', 'volume','money']) 
tech_data = get_tech_data(df)
# Bare expressions such as .tail()/.head()/.describe() have no effect when run
# as a script; presumably they are notebook cell previews.
tech_data.tail(10)

comm_data = pd.DataFrame(index = df.index)
comm_data.head()

# Relative change of each raw field versus 1, 2 and 3 rows earlier.
for c in ['open','high','low','volume']:
    for p in [1,2,3]:
       comm_data[c+"diff"+str(p)]=(df[c] - df[c].shift(p)) / df[c].shift(p)
    
comm_data.tail()

## Window differences: each relative change divided by its rolling
## (max - min) range over w rows.
ml_datas = pd.DataFrame(index = df.index)
for w in [5,10,20,30,60]:
    for c in comm_data.columns:
        ml_datas[c+"_win_"+str(w)] = comm_data[c] / (pd.Series(comm_data[c]).rolling(window=w,center=False).max() - comm_data[c].rolling(window=w,center=False).min())
        
# ml_datas.tail(10)   

## Build the machine-learning data set
ml_datas = ml_datas.join(tech_data)
## Key step: shift the features by one row, so each row holds the previous
## row's feature values.
ml_datas = ml_datas.shift(1)
## Regression target: the close of the row's own date, i.e. "tomorrow's" close
## relative to the shifted features.
ml_datas['reg_target'] = df['close']
## Classification target: True when the close rose versus the previous row.
ml_datas['clf_target'] = (df['close']/df['close'].shift(1)) - 1 > 0
ml_datas.tail(10)
ml_datas[['sma_10','reg_target','clf_target']].tail(10)

# Drop every row that still contains a NaN (indicator/window warm-up rows).
ml_datas = ml_datas.dropna()
ml_datas.describe()


# Feature matrix: every column except the two targets and OBV.
X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis = 1)
X_ori.describe()

## Label: did the close rise compared with the previous close?
y = ml_datas['clf_target']
y.describe()

# Standardize the features; this scaled copy feeds the importance forest below.
scaler = preprocessing.StandardScaler()
scaler.fit(X_ori)
X = scaler.transform(X_ori)
X[:10,:]

## Build a forest and compute feature importances
forest = ExtraTreesClassifier(n_estimators = 250,random_state = 0)
forest.fit(X,y)
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances,axis= 0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for rank, position in enumerate(indices, 1):
    print("%d. feature %s (%f)" % (rank,X_ori.columns[position],importances[position]))

# Plot only the 20 most important features.
indices = indices[:20]
bar_positions = range(len(indices))

plt.figure(figsize=(16,9))
plt.title("feature importance")
plt.bar(bar_positions,importances[indices],color='r',yerr=std[indices],align='center')
plt.xticks(bar_positions,indices)
plt.xlim([-1,len(indices)])
plt.show()

###对沪深300指数进行预测分类
# Chronological split: rows dated before `start` train the classifiers, the
# remaining rows are held out for testing.
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]
# Single-argument print call: prints the same text on Python 2 and Python 3
# (the former `print a, b, c` statement is a syntax error on Python 3).
print("%s %s %s %s" % (X_train.shape, y_train.shape, X_test.shape, y_test.shape))

## Baseline (disabled)
# model = LinearRegression()
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)
# Note: the result is a bool value
# score = r2_score(y_pred,y_test)
# print 'LinearRegression Score:',confusion_matrix(y_pred, y_test)

# NOTE(review): the classifiers are fitted on the unscaled X_ori even though a
# standardized copy (X) is computed earlier in this script -- confirm whether
# scaling was meant to be applied here as well.
models = [
    ("LR", LogisticRegression()),
    ("LDA", LDA()),
    ("QDA", QDA()),
    ("LSVC", LinearSVC()),
    ("RSVM", SVC(
        C=1000000.0, cache_size=200, class_weight=None,
        coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
        max_iter=-1, probability=False, random_state=None,
        shrinking=True, tol=0.001, verbose=False)),
    ("RF", RandomForestClassifier(
        n_estimators=1000, criterion='gini',
        max_depth=None, min_samples_split=2,
        min_samples_leaf=1, max_features='auto',
        bootstrap=True, oob_score=False, n_jobs=1,
        random_state=None, verbose=0)),
]

# Fit each model on the training rows and report how it does on the test rows.
for name, clf in models:
    clf.fit(X_train, y_train)

    # Predictions for the held-out rows.
    pred = clf.predict(X_test)

    # Hit rate (mean accuracy) followed by the confusion matrix.
    print("%s:\n%0.3f" % (name, clf.score(X_test, y_test)))
    # confusion_matrix expects (y_true, y_pred); with the arguments in this
    # order the rows are the true classes and the columns the predictions.
    # The previous call passed them reversed and printed the transpose.
    print("%s\n" % confusion_matrix(y_test, pred))

    
##网格搜索+交叉验证
# tuned_parameters = [
#     {'n_estimators': [500,1000],'min_samples_split':[5],'min_samples_leaf':[1]}
# ]
# model = GridSearchCV(RandomForestClassifier(),tuned_parameters,cv=10)
# model.fit(X_train, y_train)

# print("Optimised parameters found on training set:")
# print(model.best_estimator_, "\n")

# print("Grid scores calculated on training set:")
# for params, mean_score, scores in model.grid_scores_:
#     print("%0.3f for %r" % (mean_score, params))
    
##回归
# Regression setup: same features as the classifiers, target is the close.
X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis=1)
y = ml_datas['reg_target']

# Chronological split: rows dated before `start` train, the rest test.
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]

tuned_parameters = [
    {'alpha':[1,0.5,0.1,0.01,0.001]}
]

## Ridge regression, alpha chosen by 10-fold cross-validated grid search
model = GridSearchCV(Ridge(),tuned_parameters,cv=10)
model.fit(X_train, y_train)

print("Optimised parameters found on training set:")
# Single formatted argument: print(a, b) renders as a tuple on Python 2.
print("%s\n" % model.best_estimator_)

print("Grid scores calculated on training set:")
# grid_scores_ belongs to the legacy sklearn.grid_search API this file imports.
for params, mean_score, scores in model.grid_scores_:
    print("%0.3f for %r" % (mean_score, params))

y_pred = model.predict(X_test)
# NOTE(review): the features were shifted by one row earlier in this script,
# so y_pred[-1] appears to be the prediction for X_test.index[-1] itself
# rather than for the following day -- confirm the wording of this message.
print("tomorow close is %s,current date is %s" % (y_pred[-1],X_test.index[-1]))
print("R2_sore %s" % r2_score(y_test,y_pred))

# Plot actual versus predicted closes over the test period.
df_result = pd.DataFrame(index = y_test.index)
df_result['True value'] = y_test
df_result['Pred value'] = y_pred
df_result.plot(figsize=(16,9))

/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=5,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=30,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=60,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:30: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:31: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).std()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:87: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=12,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:88: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=26,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:91: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=9,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:45: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:45: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).std()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:66: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).mean()

 Feature ranking:
1. feature volumediff3_win_60 (0.015575)
2. feature volumediff3_win_30 (0.015485)
3. feature CCI (0.015274)
4. feature RSI (0.015049)
5. feature volumediff3_win_20 (0.014921)
6. feature volumediff1_win_5 (0.014855)
7. feature opendiff1_win_5 (0.014708)
8. feature ROC (0.014554)
9. feature volumediff2_win_30 (0.014312)
10. feature volumediff3_win_10 (0.014244)
11. feature MOM (0.014226)
12. feature volumediff3_win_5 (0.014218)
13. feature lowdiff1_win_10 (0.014178)
14. feature volumediff2_win_10 (0.014170)
15. feature volumediff2_win_5 (0.014016)
16. feature highdiff1_win_20 (0.013877)
17. feature lowdiff2_win_5 (0.013800)
18. feature volumediff1_win_60 (0.013672)
19. feature opendiff3_win_30 (0.013641)
20. feature volumediff1_win_30 (0.013517)
21. feature highdiff3_win_10 (0.013482)
22. feature volumediff1_win_20 (0.013421)
23. feature lowdiff1_win_20 (0.013404)
24. feature sma_30 (0.013394)
25. feature volumediff2_win_20 (0.013388)
26. feature macd_dea (0.013337)
27. feature volumediff2_win_60 (0.013328)
28. feature lowdiff3_win_5 (0.013268)
29. feature volumediff1_win_10 (0.013261)
30. feature lowdiff1_win_5 (0.013232)
31. feature opendiff2_win_10 (0.013123)
32. feature opendiff1_win_20 (0.013101)
33. feature highdiff1_win_5 (0.013055)
34. feature opendiff1_win_30 (0.013052)
35. feature highdiff3_win_60 (0.013029)
36. feature highdiff2_win_20 (0.013020)
37. feature EVM (0.013013)
38. feature highdiff1_win_60 (0.012998)
39. feature opendiff2_win_5 (0.012990)
40. feature highdiff3_win_20 (0.012986)
41. feature highdiff3_win_5 (0.012982)
42. feature highdiff1_win_30 (0.012960)
43. feature opendiff1_win_60 (0.012959)
44. feature highdiff3_win_30 (0.012924)
45. feature lowdiff2_win_60 (0.012850)
46. feature opendiff3_win_20 (0.012828)
47. feature highdiff1_win_10 (0.012821)
48. feature lowdiff3_win_10 (0.012809)
49. feature highdiff2_win_10 (0.012791)
50. feature opendiff3_win_5 (0.012779)
51. feature ub (0.012749)
52. feature lowdiff3_win_60 (0.012702)
53. feature sma_60 (0.012695)
54. feature lowdiff3_win_20 (0.012667)
55. feature lowdiff3_win_30 (0.012639)
56. feature sma_20 (0.012623)
57. feature sma_5 (0.012620)
58. feature opendiff1_win_10 (0.012589)
59. feature FI (0.012546)
60. feature lb (0.012427)
61. feature opendiff2_win_20 (0.012389)
62. feature highdiff2_win_60 (0.012388)
63. feature lowdiff2_win_20 (0.012380)
64. feature highdiff2_win_5 (0.012362)
65. feature highdiff2_win_30 (0.012330)
66. feature lowdiff2_win_10 (0.012264)
67. feature opendiff3_win_60 (0.012256)
68. feature lowdiff1_win_30 (0.012184)
69. feature macd_dif (0.012112)
70. feature sma_10 (0.012108)
71. feature ATR (0.012101)
72. feature opendiff2_win_30 (0.011868)
73. feature lowdiff2_win_30 (0.011766)
74. feature opendiff2_win_60 (0.011731)
75. feature lowdiff1_win_60 (0.011581)
76. feature opendiff3_win_10 (0.011045)

(425, 76) (425,) (515, 76) (515,)
LR:
0.517
[[122 124]
 [125 144]]

LDA:
0.476
[[159 182]
 [ 88  86]]

QDA:
0.485
[[100 118]
 [147 150]]

LSVC:
0.483
[[125 144]
 [122 124]]

RSVM:
0.520
[[  0   0]
 [247 268]]

RF:
0.534
[[117 110]
 [130 158]]

Optimised parameters found on training set:
(Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001), '\n')
Grid scores calculated on training set:
0.648 for {'alpha': 1}
0.628 for {'alpha': 0.5}
0.571 for {'alpha': 0.1}
0.485 for {'alpha': 0.01}
0.445 for {'alpha': 0.001}
tomorow close is 3367.07625415,current date is 2019-02-15 00:00:00
('R2_sore', 0.96953680773848738)

<matplotlib.axes._subplots.AxesSubplot at 0x7f45346c9210>

机器学习用于大盘指数预测

审核消息

该文章已通过审核

全部回复

0/140

热门文章最新文章

热门标签

更多人气分析师

财经资讯

行情数据