运用多个因子综合进行机器学习预测
import pandas as pd
import talib as tl
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
##sma
def SMA(data, ndays,tag):
SMA = pd.Series(pd.rolling_mean(data['close'], ndays), name = tag)
data[tag] = SMA
return data
# SMA(df,5)
##bbands
def BBANDS(data, ndays):
MA = pd.Series(pd.rolling_mean(data['close'], ndays))
SD = pd.Series(pd.rolling_std(data['close'], ndays))
b1 = MA + (2 * SD)
B1 = pd.Series(b1, name = 'Upper BollingerBand')
data['ub'] = B1
b2 = MA - (2 * SD)
B2 = pd.Series(b2, name = 'Lower BollingerBand')
# data = data.join(B2)
data['lb'] = B2
return data
# BBANDS(df,50)
##cci
def CCI(data, ndays):
TP = (data['high'] + data['low'] + data['close']) / 3
CCI = pd.Series((TP - pd.rolling_mean(TP, ndays)) / (0.015 * pd.rolling_std(TP, ndays)),name = 'CCI')
# data = data.join(CCI)
data['CCI'] = CCI
return data
# CCI(df,20)
##roc
def ROC(data,n):
N = data['close'].diff(n)
D = data['close'].shift(n)
ROC = pd.Series(N/D,name='Rate of Change')
data['ROC'] = ROC
# data = data.join(ROC)
return data
# ROC(df,5)
# Ease of Movement
def EVM(data, ndays):
dm = ((data['high'] + data['low'])/2) - ((data['high'].shift(1) + data['low'].shift(1))/2)
br = (data['volume'] / 100000000) / ((data['high'] - data['low']))
EVM = dm / br
EVM_MA = pd.Series(pd.rolling_mean(EVM, ndays), name = 'EVM')
data['EVM'] = EVM
# data = data.join(EVM_MA)
return data
# EVM(df,14)
# Force Index
def ForceIndex(data, ndays):
FI = pd.Series(data['close'].diff(ndays) * data['volume'], name = 'ForceIndex')
data['FI'] = FI
# data = data.join(FI)
return data
# ForceIndex(df,1)
##macd
def MACD(data,short=0,long1=0,mid=0):
if short==0:
short=12
if long1==0:
long1=26
if mid==0:
mid=9
data['sema']=pd.ewma(data['close'],span=short)
data['lema']=pd.ewma(data['close'],span=long1)
data.fillna(0,inplace=True)
data['macd_dif']=data['sema']-data['lema']
data['macd_dea']=pd.ewma(data['macd_dif'],span=mid)
# data['macd']=2*(data['macd_dif']-data['macd_dea'])
data.fillna(0,inplace=True)
return data
# MACD(df,0,0,0)
def SMA_CN(close, timeperiod) :
close = np.nan_to_num(close)
return reduce(lambda x, y: ((timeperiod - 1) * x + y) / timeperiod, close)
# 同花顺和通达信等软件中的RSI
def RSI_CN(data, timeperiod) :
close = np.array(data['close'])
diff = map(lambda x, y : x - y, close[1:], close[:-1])
diffGt0 = map(lambda x : 0 if x < 0 else x, diff)
diffABS = map(lambda x : abs(x), diff)
diff = np.array(diff)
diffGt0 = np.array(diffGt0)
diffABS = np.array(diffABS)
diff = np.append(diff[0], diff)
diffGt0 = np.append(diffGt0[0], diffGt0)
diffABS = np.append(diffABS[0], diffABS)
rsi = map(lambda x : SMA_CN(diffGt0[:x], timeperiod) / SMA_CN(diffABS[:x], timeperiod) * 100
, range(1, len(diffGt0) + 1) )
data['RSI'] = rsi
return data
#RSI_CN(df,14)
##ATR指标主要是用来衡量市场波动的强烈度
def ATR(data,timeperiod):
close_ATR = np.array(data['close'])
high_ATR = np.array(data['high'])
low_ATR = np.array(data['low'])
atr = tl.ATR(high_ATR, low_ATR, close_ATR, timeperiod)
data['ATR'] = atr
return data
# ATR(df,14)
def OBV(data):
obv = tl.OBV(np.array(data['close']),np.array(data['volume']))
data['OBV'] = obv
return data
# OBV(df)
def MOM(data):
mom = tl.MOM(np.array(data['close']), timeperiod=5)
data['MOM'] = mom
return data
# MOM(df)
def get_tech_data(df):
data = df.copy()
SMA(data,5,'sma_5')
SMA(data,10,'sma_10')
SMA(data,20,'sma_20')
SMA(data,30,'sma_30')
SMA(data,60,'sma_60')
BBANDS(data,50)
MACD(data,0,0,0)
RSI_CN(data,6)
CCI(data,20)
ROC(data,5)
EVM(data,14)
ForceIndex(data,1)
ATR(data,14)
OBV(data)
MOM(data)
# data.drop(columns=['open', 'high','close','low','volume','money'])
data = data.drop('open', 1)
data = data.drop('high', 1)
data = data.drop('close', 1)
data = data.drop('low', 1)
data = data.drop('volume', 1)
data = data.drop('money', 1)
data = data.drop('sema', 1)
data = data.drop('lema', 1)
return data
df = get_price('000300.XSHG', end_date='2019-02-17', frequency='daily', fields=['open','high','close','low', 'volume','money'])
tech_data = get_tech_data(df)
tech_data.tail(10)
comm_data = pd.DataFrame(index = df.index)
comm_data.head()
for c in ['open','high','low','volume']:
for p in [1,2,3]:
comm_data[c+"diff"+str(p)]=(df[c] - df[c].shift(p)) / df[c].shift(p)
comm_data.tail()
##窗口差异
ml_datas = pd.DataFrame(index = df.index)
for w in [5,10,20,30,60]:
for c in comm_data.columns:
ml_datas[c+"_win_"+str(w)] = comm_data[c] / (pd.Series(comm_data[c]).rolling(window=w,center=False).max() - comm_data[c].rolling(window=w,center=False).min())
# ml_datas.tail(10)
##构建机器学习数据集
ml_datas = ml_datas.join(tech_data)
##关键一步,将数据左移1天
ml_datas = ml_datas.shift(1)
##明天的收盘价
ml_datas['reg_target'] = df['close']
##明天相比当天的涨跌
ml_datas['clf_target'] = (df['close']/df['close'].shift(1)) - 1 > 0
ml_datas.tail(10)
ml_datas[['sma_10','reg_target','clf_target']].tail(10)
ml_datas = ml_datas.dropna()
ml_datas.describe()
X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis = 1)
X_ori.describe()
##当天的收盘价相比昨天的收盘价上涨还是下跌
y = ml_datas['clf_target']
y.describe()
scaler = preprocessing.StandardScaler().fit(X_ori)
X = scaler.transform(X_ori)
X[:10,:]
##Build a forerest and compute feature importance
forest = ExtraTreesClassifier(n_estimators = 250,random_state = 0)
forest.fit(X,y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],axis= 0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(len(indices)):
print("%d. feature %s (%f)" % (f+1,X_ori.columns[indices[f]],importances[indices[f]]))
indices = indices[:20]
plt.figure(figsize=(16,9))
plt.title("feature importance")
plt.bar(range(len(indices)),importances[indices],color='r',yerr=std[indices],align='center')
plt.xticks(range(len(indices)),indices)
plt.xlim([-1,len(indices)])
plt.show()
###对沪深300指数进行预测分类
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]
print X_train.shape,y_train.shape,X_test.shape,y_test.shape
##基线
# model = LinearRegression()
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)
#注意结果是bool值
# score = r2_score(y_pred,y_test)
# print 'LinearRegression Score:',confusion_matrix(y_pred, y_test)
models = [("LR", LogisticRegression()),
("LDA", LDA()),
("QDA", QDA()),
("LSVC", LinearSVC()),
("RSVM", SVC(
C=1000000.0, cache_size=200, class_weight=None,
coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
),
("RF", RandomForestClassifier(
n_estimators=1000, criterion='gini',
max_depth=None, min_samples_split=2,
min_samples_leaf=1, max_features='auto',
bootstrap=True, oob_score=False, n_jobs=1,
random_state=None, verbose=0)
)]
# Iterate through the models
for m in models:
# Train each of the models on the training set
m[1].fit(X_train, y_train)
# Make an array of predictions on the test set
pred = m[1].predict(X_test)
# Output the hit-rate and the confusion matrix for each model
print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
print("%s\n" % confusion_matrix(pred, y_test))
# print("%s\n" % r2_score(y_test,pred))
##网格搜索+交叉验证
# tuned_parameters = [
# {'n_estimators': [500,1000],'min_samples_split':[5],'min_samples_leaf':[1]}
# ]
# model = GridSearchCV(RandomForestClassifier(),tuned_parameters,cv=10)
# model.fit(X_train, y_train)
# print("Optimised parameters found on training set:")
# print(model.best_estimator_, "\n")
# print("Grid scores calculated on training set:")
# for params, mean_score, scores in model.grid_scores_:
# print("%0.3f for %r" % (mean_score, params))
##回归
X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis=1)
y = ml_datas['reg_target']
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]
tuned_parameters = [
{'alpha':[1,0.5,0.1,0.01,0.001]}
]
##岭回归
model = GridSearchCV(Ridge(),tuned_parameters,cv=10)
model.fit(X_train, y_train)
print("Optimised parameters found on training set:")
print(model.best_estimator_, "\n")
print("Grid scores calculated on training set:")
for params, mean_score, scores in model.grid_scores_:
print("%0.3f for %r" % (mean_score, params))
y_pred = model.predict(X_test)
print("tomorow close is %s,current date is %s" % (y_pred[-1],X_test.index[-1]))
print("R2_sore",r2_score(y_test,y_pred))
df_result = pd.DataFrame(index = y_test.index)
df_result['True value'] = y_test
df_result['Pred value'] = y_pred
df_result.plot(figsize=(16,9))