# [1] Import the required libraries
from time import time

import numpy as np
import pandas as pd
import xgboost as xgb
from jqfactor import get_all_factors
from jqfactor import get_factor_values
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
# [2] Data acquisition
# Factor names requested from jqfactor; each one becomes a feature column.
# NOTE(review): the scraped source showed the fourth name masked as 'D*OL5';
# it is restored here as 'DAVOL5' -- confirm against get_all_factors().
factors = [
    'VROC12', 'TVMA6', 'VR', 'DAVOL5', 'ARBR', 'Variance20', 'Skewness20',
    'Kurtosis20', 'sharpe_ratio_20', 'BBIC', 'BIAS5', 'CCI10', 'ROC6',
    'single_day_VPT', 'MAC5', 'EMA5', 'MACDC',
]

# Factor data (features).
# 2017-01-02 is the New Year holiday and 2017-01-03 is a Tuesday; the training
# window deliberately includes one extra day to make the later label
# alignment easier.
get_train_data = get_factor_values(normalize_code('600519'), factors, '2007-01-01', '2017-01-03')
get_test_data = get_factor_values(normalize_code('600519'), factors, '2017-01-03', '2019-01-02')

# Columns are assigned one at a time so they keep the order of `factors`.
train = pd.DataFrame()
test = pd.DataFrame()
for x in factors:
    train[x] = get_train_data[x]['600519.XSHG']
    test[x] = get_test_data[x]['600519.XSHG']

# Label data.
# Fetch the price history (covering one more trading day than is needed for
# the features) so it can later be turned into up/down labels from the
# difference between consecutive closing prices.
# Suspended trading days must be handled: per the original author, their
# volume is 0, so price and volume are fetched together and the volume is
# used to filter those days out.
# Original author's notes on skip_paused: it controls whether non-trading
# dates (suspension, before listing, after delisting) are skipped. When they
# are not skipped, suspended days are filled with the data from before the
# suspension and dates before listing / after delisting are NaN. The default
# is False, and when it is True only a single security can be queried.
train_p = get_price('600519.XSHG', start_date='2007-01-01', end_date='2017-01-03', frequency='daily', fields=['close', 'volume'], skip_paused=False, fq='pre')
test_p = get_price('600519.XSHG', start_date='2017-01-03', end_date='2019-01-02', frequency='daily', fields=['close', 'volume'], skip_paused=False, fq='pre')

# Join features and prices side by side on the shared date index.
train = pd.concat([train, train_p], axis=1)
test = pd.concat([test, test_p], axis=1)

# Drop the rows whose volume is 0.
# (Row counts recorded by the original author after this step: train 2420,
# test 488.)
train = train[train.volume != 0]
test = test[test.volume != 0]
# [3] Data preprocessing
def _attach_next_day_label(frame):
    """Return the feature rows of *frame* with a next-day up/down label 'y'.

    *frame* must be ordered by date and contain 'close' and 'volume' columns.
    For every row except the last, 'y' is 1.0 when the close of the following
    row is higher than the close of this row and 0.0 otherwise. The last row
    has no following row, so it is dropped. The 'close' and 'volume' columns
    are removed and the date index is replaced by a 0..n-1 integer index.
    """
    # diff() yields close[t] - close[t-1]; shifting by -1 places
    # close[t+1] - close[t] on row t.
    next_day_change = frame['close'].diff().shift(-1)
    labelled = (
        frame.iloc[:-1]
        .drop(columns=['close', 'volume'])
        .reset_index(drop=True)
    )
    # .values strips the date index so the labels align by position.
    labelled['y'] = (next_day_change.iloc[:-1] > 0).astype(float).values
    return labelled


# Labels come from the closing price (the original code differenced 'volume'
# here, contradicting its own comments). Aligning by row position instead of
# hard-coded dates and row numbers keeps this correct if the date range or
# the number of suspended days changes.
train = _attach_next_day_label(train)
test = _attach_next_day_label(test)
# At this point all the data preparation is done.
# [4] Model construction
target = 'y'
IDcol = 'id'


def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Use xgb.cv to choose n_estimators for *alg* and set it in place.

    alg: estimator exposing get_xgb_params(), get_params() and set_params().
    dtrain: frame holding the predictor columns and the module-level
        `target` column.
    predictors: names of the feature columns in *dtrain*.
    useTrainCV: when false the function does nothing.
    cv_folds: number of folds passed to xgb.cv.
    early_stopping_rounds: passed through to xgb.cv.

    The number of rows in the frame returned by xgb.cv is written back to
    alg as n_estimators and printed. Nothing is returned.
    """
    if not useTrainCV:
        return

    booster_params = alg.get_xgb_params()
    feature_matrix = dtrain[predictors].values
    label_vector = dtrain[target].values
    xgtrain = xgb.DMatrix(feature_matrix, label=label_vector)
    # The estimator's current n_estimators is the upper bound on rounds.
    max_rounds = alg.get_params()['n_estimators']
    cvresult = xgb.cv(
        booster_params,
        xgtrain,
        num_boost_round=max_rounds,
        nfold=cv_folds,
        metrics='auc',
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=False,
    )
    chosen_rounds = cvresult.shape[0]
    alg.set_params(n_estimators=chosen_rounds)
    print(chosen_rounds)
# 1. Keep the learning rate fixed at 0.1 and tune n_estimators first
start = time()

# Every column except the label and the id column is a feature.
excluded_columns = [target, IDcol]
predictors = [column for column in train.columns if column not in excluded_columns]

xgb1_settings = dict(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    seed=27,
)
xgb1 = XGBClassifier(**xgb1_settings)
modelfit(xgb1, train, predictors)

end = time()
time_elapsed = end - start
print('Training is end')
print('Training time is {} h.'.format(time_elapsed / 3600))
14 Training is end Training time is 0.00031509545114305287 h.
# 2. Tune max_depth and min_child_weight with a grid search
# Restart the timer so the reported time covers this step only; the original
# reused `start` from step 1 and therefore included everything since then.
start = time()

param_test1 = {
    'max_depth': range(3, 10, 1),
    'min_child_weight': range(1, 5, 1),
}
# n_estimators=14 is the value printed by step 1 in the original run.
base_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=14,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
# NOTE(review): `iid` is kept to preserve the original scoring on the
# scikit-learn version this notebook ran on; newer scikit-learn releases no
# longer accept this argument, so drop it when upgrading.
gsearch1 = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_test1,
    scoring='roc_auc',
    iid=False,
    cv=5,
)
gsearch1.fit(train[predictors], train[target])
print(gsearch1.best_params_, gsearch1.best_score_)

end = time()
time_elapsed = end - start
print('Training is end')
print('Training time is {} h.'.format(time_elapsed / 3600))
{'max_depth': 3, 'min_child_weight': 3} 0.6764277924684329 Training is end Training time is 0.014299993382559882 h.
# 3. Prediction
# The model is bound to `final_model` rather than `xgb`: the original rebound
# `xgb`, which replaced the `xgboost` module alias that modelfit() relies on
# (xgb.DMatrix / xgb.cv) and broke any later call to it.
# max_depth=3 and min_child_weight=3 are the best parameters printed by the
# grid search in the original run; n_estimators=14 comes from step 1.
final_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=14,
    max_depth=3,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='auc',
    seed=27,
)

# Train. eval_metric is already set on the constructor, so it is not passed
# again here; no eval_set is supplied, so nothing is evaluated during fit.
final_model.fit(train[predictors], train[target])

# Predict on the held-out period and store the result next to the true label.
test['y_predict'] = final_model.predict(test[predictors])
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty. if diff:
# Compute the precision.
# For a long-only strategy, precision is the metric that matters most:
# (days that actually rose) / (days predicted to rise).
# Restricting to rows predicted as 1 and summing both columns prints those
# two counts: sum of 'y' is the numerator, sum of 'y_predict' the denominator.
predicted_up_mask = test['y_predict'] == 1
pricision = test.loc[predicted_up_mask, ['y', 'y_predict']]
column_totals = pricision.sum()
print(column_totals)
y 151.0 y_predict 237.0 dtype: float64
本社区仅针对特定人员开放
查看需注册登录并通过风险意识测评
5秒后跳转登录页面...