请 [注册] 或 [登录]  | 返回主站

量化交易吧 /  数理科学 帖子:3364742 新帖:6

机器学习用于大盘指数预测-进阶版

专门亏损发表于:7 月 7 日 17:58回复(1)

新增了集成学习算法用于回归预测,提升了预测准确率。
1.begging及Random Forest算法
2.boosting及xgboost算法

import pandas as pd
import talib as tl
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
from sklearn.grid_search import GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

数据准备,特征定义

##sma
def SMA(data, ndays,tag): 
 SMA = pd.Series(pd.rolling_mean(data['close'], ndays), name = tag) 
 data[tag] = SMA
 return data
# SMA(df,5)

##bbands 布林线
def BBANDS(data, ndays):

 MA = pd.Series(pd.rolling_mean(data['close'], ndays)) 
 SD = pd.Series(pd.rolling_std(data['close'], ndays))
 b1 = MA + (2 * SD)
 B1 = pd.Series(b1, name = 'Upper BollingerBand') 
 data['ub'] = B1
 b2 = MA - (2 * SD)
 B2 = pd.Series(b2, name = 'Lower BollingerBand') 
#  data = data.join(B2) 
 data['lb'] = B2
 return data
# BBANDS(df,50)

##cci
def CCI(data, ndays): 
 TP = (data['high'] + data['low'] + data['close']) / 3 
 CCI = pd.Series((TP - pd.rolling_mean(TP, ndays)) / (0.015 * pd.rolling_std(TP, ndays)),name = 'CCI') 
#  data = data.join(CCI) 
 data['CCI'] = CCI   
 return data
# CCI(df,20)

##roc
def ROC(data,n):
 N = data['close'].diff(n)
 D = data['close'].shift(n)
 ROC = pd.Series(N/D,name='Rate of Change')
 data['ROC'] = ROC
#  data = data.join(ROC)
 return data
# ROC(df,5)

# Ease of Movement 
def EVM(data, ndays): 
 dm = ((data['high'] + data['low'])/2) - ((data['high'].shift(1) + data['low'].shift(1))/2)
 br = (data['volume'] / 100000000) / ((data['high'] - data['low']))
 EVM = dm / br 
 EVM_MA = pd.Series(pd.rolling_mean(EVM, ndays), name = 'EVM')
 data['EVM']  =  EVM
#  data = data.join(EVM_MA) 
 return data
# EVM(df,14)

# Force Index 
def ForceIndex(data, ndays): 
 FI = pd.Series(data['close'].diff(ndays) * data['volume'], name = 'ForceIndex') 
 data['FI'] = FI
#  data = data.join(FI) 
 return data
# ForceIndex(df,1)

##macd
def MACD(data,short=0,long1=0,mid=0):
    if short==0:
        short=12
    if long1==0:
        long1=26
    if mid==0:
        mid=9
    data['sema']=pd.ewma(data['close'],span=short)
    data['lema']=pd.ewma(data['close'],span=long1)
    data.fillna(0,inplace=True)
    data['macd_dif']=data['sema']-data['lema']
    data['macd_dea']=pd.ewma(data['macd_dif'],span=mid)
#     data['macd']=2*(data['macd_dif']-data['macd_dea'])
    data.fillna(0,inplace=True)
    return data

# MACD(df,0,0,0)

def SMA_CN(close, timeperiod) :
    close = np.nan_to_num(close)
    return reduce(lambda x, y: ((timeperiod - 1) * x + y) / timeperiod, close)

# 同花顺和通达信等软件中的RSI
def RSI_CN(data, timeperiod) :
    close = np.array(data['close'])
    diff = map(lambda x, y : x - y, close[1:], close[:-1])
    diffGt0 = map(lambda x : 0 if x < 0 else x, diff)
    diffABS = map(lambda x : abs(x), diff)
    diff = np.array(diff)
    diffGt0 = np.array(diffGt0)
    diffABS = np.array(diffABS)
    diff = np.append(diff[0], diff)
    diffGt0 = np.append(diffGt0[0], diffGt0)
    diffABS = np.append(diffABS[0], diffABS)
    rsi = map(lambda x : SMA_CN(diffGt0[:x], timeperiod) / SMA_CN(diffABS[:x], timeperiod) * 100
            , range(1, len(diffGt0) + 1) )
    data['RSI'] = rsi
    return data
#RSI_CN(df,14)

##ATR指标主要是用来衡量市场波动的强烈度
def ATR(data,timeperiod):
    close_ATR = np.array(data['close'])
    high_ATR = np.array(data['high'])
    low_ATR = np.array(data['low'])
    atr = tl.ATR(high_ATR, low_ATR, close_ATR, timeperiod)
    data['ATR'] = atr
    return data
# ATR(df,14)

#
def OBV(data):
    obv = tl.OBV(np.array(data['close']),np.array(data['volume']))
    data['OBV'] = obv
    return data
# OBV(df)

#动量指标
def MOM(data):
    mom = tl.MOM(np.array(data['close']), timeperiod=5)
    data['MOM'] = mom
    return data
# MOM(df)
    

def get_tech_data(df):
    data = df.copy()
    SMA(data,5,'sma_5')
    SMA(data,10,'sma_10')
    SMA(data,20,'sma_20')
    SMA(data,30,'sma_30')
    SMA(data,60,'sma_60')
    BBANDS(data,50)
    MACD(data,0,0,0)
    RSI_CN(data,6)
    CCI(data,20)
    ROC(data,5)
    EVM(data,14)
    ForceIndex(data,1)
    ATR(data,14)
    OBV(data)
    MOM(data)
#     data.drop(columns=['open', 'high','close','low','volume','money'])
    data = data.drop('open', 1)
    data = data.drop('high', 1)
    data = data.drop('close', 1)
    data = data.drop('low', 1)
    data = data.drop('volume', 1)
    data = data.drop('money', 1)
    data = data.drop('sema', 1)
    data = data.drop('lema', 1)

    return data
df = get_price('000300.XSHG', end_date='2019-06-30', frequency='daily', fields=['open','high','close','low', 'volume','money']) 
tech_data = get_tech_data(df)
tech_data.tail(10)

comm_data = pd.DataFrame(index = df.index)
comm_data.head()

##取前几天的价格、成交量指标之比
for c in ['open','high','low','volume']:
    for p in [1,2,3]:
       comm_data[c+"diff"+str(p)]=(df[c] - df[c].shift(p)) / df[c].shift(p)
    
comm_data.tail()

##窗口差异
ml_datas = pd.DataFrame(index = df.index)
for w in [5,10,20,30,60]:
    for c in comm_data.columns:
        ml_datas[c+"_win_"+str(w)] = comm_data[c] / (pd.Series(comm_data[c]).rolling(window=w,center=False).max() - comm_data[c].rolling(window=w,center=False).min())
        
# ml_datas.tail(10)   

##构建机器学习数据集
ml_datas = ml_datas.join(tech_data)
##关键一步,将数据左移1天
ml_datas = ml_datas.shift(1)
##明天的收盘价
ml_datas['reg_target'] = df['close']
##明天相比当天的涨跌
ml_datas['clf_target'] = (df['close']/df['close'].shift(1)) - 1 > 0
ml_datas.tail(10)
ml_datas[['sma_10','reg_target','clf_target']].tail(10)

ml_datas = ml_datas.dropna()
ml_datas.describe()


X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis = 1)
X_ori.describe()

##当天的收盘价相比昨天的收盘价上涨还是下跌
y = ml_datas['clf_target']
y.describe()

scaler = preprocessing.StandardScaler().fit(X_ori)
X = scaler.transform(X_ori)
X[:10,:]

##Build a forerest and compute feature importance
forest = ExtraTreesClassifier(n_estimators = 250,random_state = 0)
forest.fit(X,y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],axis= 0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for f in range(len(indices)):
    print("%d. feature %s (%f)" % (f+1,X_ori.columns[indices[f]],importances[indices[f]]))
    
indices = indices[:20]

plt.figure(figsize=(16,9))
plt.title("feature importance")
plt.bar(range(len(indices)),importances[indices],color='r',yerr=std[indices],align='center')
plt.xticks(range(len(indices)),indices)
plt.xlim([-1,len(indices)])
plt.show()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:3: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=5,center=False).mean()
  This is separate from the ipykernel package so we can avoid doing imports until
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:3: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).mean()
  This is separate from the ipykernel package so we can avoid doing imports until
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:3: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
  This is separate from the ipykernel package so we can avoid doing imports until
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:3: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=30,center=False).mean()
  This is separate from the ipykernel package so we can avoid doing imports until
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:3: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=60,center=False).mean()
  This is separate from the ipykernel package so we can avoid doing imports until
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:11: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).mean()
  # This is added back by InteractiveShellApp.init_path()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:12: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).std()
  if sys.path[0] == '':
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:69: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=12,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:70: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=26,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:73: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=9,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:26: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:26: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).std()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:47: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).mean()
Feature ranking:
1. feature volumediff3_win_30 (0.015192)
2. feature RSI (0.014675)
3. feature CCI (0.014637)
4. feature volumediff3_win_60 (0.014632)
5. feature ROC (0.014451)
6. feature lb (0.014261)
7. feature opendiff1_win_5 (0.014242)
8. feature volumediff2_win_60 (0.014173)
9. feature opendiff1_win_60 (0.014170)
10. feature volumediff3_win_20 (0.014139)
11. feature volumediff3_win_10 (0.014102)
12. feature MOM (0.014039)
13. feature volumediff1_win_5 (0.014023)
14. feature opendiff1_win_30 (0.013972)
15. feature volumediff3_win_5 (0.013943)
16. feature macd_dea (0.013851)
17. feature volumediff1_win_20 (0.013722)
18. feature highdiff3_win_10 (0.013617)
19. feature volumediff1_win_10 (0.013617)
20. feature highdiff1_win_20 (0.013546)
21. feature volumediff1_win_30 (0.013507)
22. feature opendiff2_win_5 (0.013475)
23. feature highdiff2_win_10 (0.013420)
24. feature volumediff1_win_60 (0.013389)
25. feature lowdiff1_win_10 (0.013377)
26. feature highdiff1_win_10 (0.013338)
27. feature highdiff3_win_30 (0.013295)
28. feature lowdiff2_win_5 (0.013289)
29. feature lowdiff2_win_60 (0.013198)
30. feature highdiff1_win_60 (0.013193)
31. feature EVM (0.013190)
32. feature opendiff3_win_5 (0.013172)
33. feature sma_20 (0.013171)
34. feature highdiff1_win_30 (0.013160)
35. feature volumediff2_win_20 (0.013150)
36. feature highdiff3_win_5 (0.013143)
37. feature sma_30 (0.013135)
38. feature lowdiff1_win_20 (0.013119)
39. feature highdiff2_win_5 (0.013113)
40. feature lowdiff3_win_5 (0.013112)
41. feature opendiff1_win_20 (0.013077)
42. feature volumediff2_win_30 (0.013072)
43. feature opendiff2_win_10 (0.012966)
44. feature opendiff1_win_10 (0.012929)
45. feature opendiff3_win_60 (0.012910)
46. feature lowdiff3_win_10 (0.012905)
47. feature opendiff3_win_30 (0.012879)
48. feature ATR (0.012831)
49. feature lowdiff1_win_30 (0.012809)
50. feature lowdiff2_win_20 (0.012796)
51. feature macd_dif (0.012790)
52. feature lowdiff3_win_60 (0.012764)
53. feature opendiff2_win_30 (0.012763)
54. feature opendiff2_win_20 (0.012761)
55. feature sma_5 (0.012751)
56. feature volumediff2_win_10 (0.012699)
57. feature sma_10 (0.012698)
58. feature highdiff1_win_5 (0.012646)
59. feature lowdiff1_win_5 (0.012639)
60. feature highdiff2_win_20 (0.012623)
61. feature volumediff2_win_5 (0.012612)
62. feature FI (0.012565)
63. feature sma_60 (0.012537)
64. feature lowdiff2_win_30 (0.012485)
65. feature highdiff3_win_20 (0.012473)
66. feature opendiff3_win_20 (0.012416)
67. feature highdiff2_win_60 (0.012382)
68. feature ub (0.012334)
69. feature lowdiff3_win_30 (0.012324)
70. feature highdiff2_win_30 (0.012282)
71. feature highdiff3_win_60 (0.012242)
72. feature lowdiff1_win_60 (0.012113)
73. feature opendiff3_win_10 (0.011946)
74. feature lowdiff2_win_10 (0.011733)
75. feature lowdiff3_win_20 (0.011696)
76. feature opendiff2_win_60 (0.011601)

下一天涨跌分类预测

###对沪深300指数进行预测分类
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]
print X_train.shape,y_train.shape,X_test.shape,y_test.shape

##基线
# model = LinearRegression()
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)
#注意结果是bool值
# score = r2_score(y_pred,y_test)
# print 'LinearRegression Score:',confusion_matrix(y_pred, y_test)

models = [("LR", LogisticRegression()), 
              ("LDA", LDA()), 
              ("QDA", QDA()),
              ("LSVC", LinearSVC()),
              ("RSVM", SVC(
              	C=1000000.0, cache_size=200, class_weight=None,
                coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
                max_iter=-1, probability=False, random_state=None,
                shrinking=True, tol=0.001, verbose=False)
              ),
              ("RF", RandomForestClassifier(
              	n_estimators=1000, criterion='gini', 
                max_depth=None, min_samples_split=2, 
                min_samples_leaf=1, max_features='auto', 
                bootstrap=True, oob_score=False, n_jobs=1, 
                random_state=None, verbose=0)
              )]

    # Iterate through the models
for m in models:

    # Train each of the models on the training set
    m[1].fit(X_train, y_train)

    # Make an array of predictions on the test set
    pred = m[1].predict(X_test)

    # Output the hit-rate and the confusion matrix for each model
    print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
    print("%s\n" % confusion_matrix(pred, y_test))
#     print("%s\n" % r2_score(y_test,pred))

    
##网格搜索+交叉验证
# tuned_parameters = [
#     {'n_estimators': [500,1000],'min_samples_split':[5],'min_samples_leaf':[1]}
# ]
# model = GridSearchCV(RandomForestClassifier(),tuned_parameters,cv=10)
# model.fit(X_train, y_train)

# print("Optimised parameters found on training set:")
# print(model.best_estimator_, "\n")

# print("Grid scores calculated on training set:")
# for params, mean_score, scores in model.grid_scores_:
#     print("%0.3f for %r" % (mean_score, params))
(425, 76) (425,) (605, 76) (605,)
LR:
0.509
[[144 148]
 [149 164]]

LDA:
0.494
[[184 197]
 [109 115]]

QDA:
0.496
[[128 140]
 [165 172]]

LSVC:
0.491
[[149 164]
 [144 148]]

RSVM:
0.516
[[0 0]
 [293 312]]

RF:
0.519
[[128 126]
 [165 186]]

开始进行回归

##回归
X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis=1)
y = ml_datas['reg_target']
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]
tuned_parameters = [
    {'alpha':[1,0.5,0.1,0.01,0.001]}
]
##岭回归,网格搜索调参法
model = GridSearchCV(Ridge(),tuned_parameters,cv=10)
model.fit(X_train, y_train)

print("Optimised parameters found on training set:")
print(model.best_estimator_, "\n")

print("Grid scores calculated on training set:")
for params, mean_score, scores in model.grid_scores_:
    print("%0.3f for %r" % (mean_score, params))
    
y_pred = model.predict(X_test)
print("tomorow close is %s,current date is %s" % (y_pred[-1],X_test.index[-1]))
print("R2_sore",r2_score(y_test,y_pred))

df_result = pd.DataFrame(index = y_test.index)

df_result['True value'] = y_test
df_result['Pred value'] = y_pred
df_result.plot(figsize=(16,9))
Optimised parameters found on training set:
(Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001), '\n')
Grid scores calculated on training set:
0.648 for {'alpha': 1}
0.628 for {'alpha': 0.5}
0.571 for {'alpha': 0.1}
0.485 for {'alpha': 0.01}
0.445 for {'alpha': 0.001}
tomorow close is 3781.00709633,current date is 2019-06-28 00:00:00
('R2_sore', 0.96604132097418804)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd60c083a90>

目前基准的R2_sore为0.96604132097418804,接下来使用一些高阶的版本降低之。 目前已知alpha为0.001为较优秀参数

首先使用bagging, Bagging把很多的小分类器放在一起,每个train随机的一部分数据,然后把它们的最终结果综合起来(多数投票制)。

from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge

ridge = Ridge(0.001)
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
    
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("n_estimator vs CV Error");
##begging预测一把
beg_clf = BaggingRegressor(n_estimators=10, base_estimator=ridge)
beg_clf.fit(X_train, y_train)
beg_y_pred = beg_clf.predict(X_test)
print("beg tomorow close is %s,current date is %s" % (beg_y_pred[-1],X_test.index[-1]))
print("beg R2_sore",r2_score(y_test,beg_y_pred))

beg_df_result = pd.DataFrame(index = y_test.index)

beg_df_result['True value'] = y_test
beg_df_result['Pred value'] = beg_y_pred
beg_df_result.plot(figsize=(16,9))
beg tomorow close is 3826.79950296,current date is 2019-06-28 00:00:00
('beg R2_sore', 0.95443014991895758)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd60c122b50>

begging的预测均方误差下降到了0.955左右

再试试普通的决策树

params = [10, 15, 20, 25, 30, 40, 50, 60, 70, 100]
test_scores = []
for param in params:
    clf = BaggingRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("n_estimator vs CV Error");

普通决策树效果确实不咋好,Boosting比Bagging理论上更高级点,它也是揽来一把的分类器。但是把他们线性排列。下一个分类器把上一个分类器分类得不好的地方加上更高的权重,这样下一个分类器就能在这个部分学得更加“深刻”。

from xgboost import XGBRegressor
params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("max_depth vs CV Error");

使用两层xgboost进行回归

xg_clf = XGBRegressor(max_depth=2)
xg_clf.fit(X_train, y_train)
xg_y_pred = xg_clf.predict(X_test)
print("xg tomorow close is %s,current date is %s" % (xg_y_pred[-1],X_test.index[-1]))
print("xg R2_sore",r2_score(y_test,xg_y_pred))

xg_df_result = pd.DataFrame(index = y_test.index)

xg_df_result['True value'] = y_test
xg_df_result['Pred value'] = xg_y_pred
xg_df_result.plot(figsize=(16,9))
xg tomorow close is 3749.21,current date is 2019-06-28 00:00:00
('xg R2_sore', 0.92536326370539124)
<matplotlib.axes._subplots.AxesSubplot at 0x7fd60c3ca190>

均方根误差下降到了0.925

 

全部回复

0/140

量化课程

    移动端课程