繁簡切換您正在訪問的是FX168財經網,本網站所提供的內容及信息均遵守中華人民共和國香港特別行政區當地法律法規。

FX168财经网>人物频道>帖子

机器学习(深度学习)策略开发代码框架(基于TensorFlow)

作者/sdeewew 2019-05-10 07:24 0 来源: FX168财经网人物频道

近期在研究利用机器学习和深度学习进行策略开发,试图探索市场深层规律。
写了一个简单的代码框架,包括数据获取,数据处理,特征提取,利用机器学习和深度学习算法分类。代码最后的分类部分只做了部分调参工作,在此基础上进一步细化也比较容易。
本文数据是随意获取的,重在方法和思路,不重结论,因为得到有用的结论需要在数据构造、特征提取以及最后的分类调参上做大量细化工作,而且真正有用的结论是核心秘密,一般也不会分享。
除了第一部分数据获取要利用聚宽平台的数据库,其他模块建议在自己机器上跑,数据存储和读取的模块也已经写好了,直接复制到本地就能跑。

希望能对刚接触机器学习的小伙伴有一点儿作用。

数据获取

import xlrdfrom jqfactor import *import pandas as pdimport numpy as npfrom multiprocessing.dummy import Pool as ThreadPoolfrom jqdata import *import time
# Reference timestamp for the download. time.clock() was removed in
# Python 3.8; time.perf_counter() is the documented replacement.
start_clock = time.perf_counter()

# The factor names live column-wise on the first sheet of FactorTable.xlsx.
ExcellFile = xlrd.open_workbook('FactorTable.xlsx')
name = ExcellFile.sheet_names()
sheet = ExcellFile.sheet_by_name(name[0])

# One spreadsheet column per factor family.
# NOTE(review): the slice lengths presumably equal the number of populated
# rows in each column -- confirm against the workbook.
factor_quality = list(sheet.col_values(1))
factor_fundamental = list(sheet.col_values(2))[:28]
factor_mood = list(sheet.col_values(3)[:35])
factor_growth = list(sheet.col_values(4))[:8]
factor_risk = list(sheet.col_values(5))[:12]
factor_stock = list(sheet.col_values(6))[:15]

# Universe and sample window used by every download below.
stocks = ['000001.XSHE']
start_date = '2015-01-01'
end_date = '2018-11-08'
factor_list = (
    factor_quality
    + factor_fundamental
    + factor_mood
    + factor_growth
    + factor_risk
    + factor_stock
)
trade_date_list = get_trade_days(start_date, end_date)
def get_jq_factor_values(date=None):
    """Fetch the values of ``factor_list`` for ``stocks`` from JoinQuant.

    Args:
        date: Optional single day. When omitted, the whole module-level
            ``start_date``..``end_date`` window is requested in one call.
            When given, only that day is requested; this is the form
            ``get_raw_factor_dict`` uses when mapping over
            ``trade_date_list``.

    Returns:
        Whatever ``get_factor_values`` returns for the requested window.
    """
    if date is None:
        first_day, last_day = start_date, end_date
    else:
        first_day = last_day = date
    return get_factor_values(
        stocks, factor_list, start_date=first_day, end_date=last_day
    )


res = get_jq_factor_values()


def get_raw_factor_dict():
    """Fetch factor values one trading day at a time using a thread pool.

    Returns:
        A list with one ``get_jq_factor_values(day)`` result per entry of
        ``trade_date_list``, in the same order. Empty when there are no
        trading days.
    """
    day_count = len(trade_date_list)
    if day_count == 0:
        # ThreadPool(processes=0) raises ValueError; nothing to fetch anyway.
        return []

    # One thread per trading day would start hundreds of threads for a
    # multi-year window, so the pool size is capped.
    max_fetch_threads = 32
    pool = ThreadPool(processes=min(day_count, max_fetch_threads))
    try:
        frame_list = pool.map(get_jq_factor_values, trade_date_list)
    finally:
        # Release the worker threads even if one of the downloads failed.
        pool.close()
        pool.join()
    return frame_list
import pickle

# Persist the downloaded factor data so the later stages can run offline.
# The with-block closes the file even if pickling raises.
with open('all_factors.pkl', 'wb') as p_file:
    pickle.dump(res, p_file)
def get_day_profit(stocks, end_date, start_date=None, count=-1, pre_num=1):
    """Label each day by the sign of its close-price return.

    Args:
        stocks: list or Series of security codes.
        end_date: last day of the price window.
        start_date: first day of the window; used when ``count`` is -1.
        count: number of bars to fetch backwards from ``end_date``; used
            instead of ``start_date`` whenever it is not -1.
        pre_num: number of periods over which the percentage change is
            computed.

    Returns:
        DataFrame indexed by date whose single column is renamed to
        ``profit_class``: 1 where the return is positive, 0 where it is
        negative (a zero return is already 0).
    """
    if count != -1:
        close = get_price(
            stocks, end_date=end_date, count=count, fields=['close']
        )['close']
    else:
        close = get_price(stocks, start_date, end_date, fields=['close'])['close']

    returns = close.pct_change(periods=pre_num).dropna()
    returns[returns > 0] = 1
    returns[returns < 0] = 0
    returns.columns = ['profit_class']
    return returns


profit = get_day_profit(stocks, end_date, start_date)
# Persist the up/down labels next to the factor data.
# The with-block closes the file even if pickling raises.
with open('profit.pkl', 'wb') as p_file:
    pickle.dump(profit, p_file)

数据处理与特征选择

import xlrdimport pickleimport pandas as pdimport numpy as npfrom sklearn.feature_selection import SelectKBest,SelectPercentile,SelectFromModel,chi2,f_classif,mutual_info_classif,RFEfrom scipy.stats import pearsonrfrom sklearn.ensemble import RandomForestRegressor,RandomForestClassifierfrom sklearn.svm import SVC,LinearSVC,LinearSVR,SVRfrom sklearn.tree import DecisionTreeClassifierimport lightgbm as lgbfrom sklearn.model_selection import train_test_splitimport gcimport pickle
# Read back the factor data written by the acquisition stage and join its
# pieces column-wise.
with open('all_factors.pkl', 'rb') as pl_file:
    frame_list = pickle.load(pl_file)
factor_df = pd.concat(frame_list, axis='columns')

with open('profit.pkl', 'rb') as pl_file:
    profit = pickle.load(pl_file)

# shift(-1) moves every label one row up, so each row of factors is paired
# with the label of the following row.
profit_shift = profit.shift(-1)
data = pd.concat((factor_df, profit_shift), axis='columns')
# Remove columns, then rows, that hold no data at all.
data_dn1 = data.dropna(axis=1, how='all')
data_dn0 = data_dn1.dropna(how='all')

# Remaining gaps are replaced by the mean of their column.
column_means = data_dn0.mean()
data_dn = data_dn0.fillna(column_means)

# Discard the first and last rows (looked up by index label).
edge_labels = [data_dn.index[0], data_dn.index[-1]]
data_df = data_dn.drop(edge_labels)

# Every column but the last is a feature; the last one is the label.
columns = data_df.columns
data_x = data_df[columns[:-1]]
data_y = data_df[columns[-1]]
def winsorize_and_standarlize(data, qrange=(0.05, 0.95), axis=0):
    """Winsorize ``data`` at two quantiles, then z-score it.

    Values below the lower quantile are raised to it and values above the
    upper quantile are lowered to it; the result is then centred on its mean
    and divided by its standard deviation.

    The input is never modified. The previous implementation clipped the
    caller's DataFrame in place through chained indexing, and its row-wise
    and Series branches compared against a two-row quantile object instead
    of scalar bounds.

    Args:
        data: DataFrame or Series to process. Any other type is returned
            unchanged.
        qrange: two-element sequence ``(lower_quantile, upper_quantile)``.
        axis: for a DataFrame, 0 processes each column independently; any
            other value processes each row independently. Ignored for a
            Series.

    Returns:
        A new object of the same shape and labels as ``data``. For a
        DataFrame, NaN in the result (for example from a zero standard
        deviation) is replaced by 0; a Series result is not filled.
    """
    lower_q, upper_q = qrange[0], qrange[1]

    if isinstance(data, pd.DataFrame):
        # Row-wise processing is column-wise processing of the transpose.
        row_wise = axis != 0
        frame = data.T if row_wise else data

        floor = frame.quantile(lower_q)
        ceiling = frame.quantile(upper_q)
        # axis=1 aligns the per-column bounds with the frame's columns.
        clipped = frame.clip(lower=floor, upper=ceiling, axis=1)
        scaled = ((clipped - clipped.mean()) / clipped.std()).fillna(0)
        return scaled.T if row_wise else scaled

    if isinstance(data, pd.Series):
        clipped = data.clip(
            lower=data.quantile(lower_q), upper=data.quantile(upper_q)
        )
        return (clipped - clipped.mean()) / clipped.std()

    return data


data_ws = winsorize_and_standarlize(data_x)
class FeatureSelection():
    """Collection of feature-selection strategies.

    Methods:
        identify_collinear: drop one feature out of every pair whose
            absolute correlation exceeds ``correlation_threshold``.
        identify_importance_lgbm: rank features by LightGBM
            ``feature_importances_`` and keep the leading ones whose
            cumulative normalised importance stays within ``p_importance``.
        filter_select: univariate selection; ``SelectKBest`` when ``k`` is
            given, otherwise ``SelectPercentile`` keeping the top ``p``
            percent, both scored with ``method``.
        wrapper_select: recursive feature elimination (RFE) around
            ``estimator``, keeping ``n`` features.
        embedded_select: ``SelectFromModel`` around ``estimator``.
    """

    def __init__(self):
        # Boolean masks telling which input features each selector kept.
        self.supports_filter = None
        self.supports_wrapper = None
        self.supports_embedded = None
        # Names of the features each selector kept.
        self.columns_lgbm = None
        self.columns_filter = None
        self.columns_wrapper = None
        self.columns_embedded = None
        # Feature pairs whose correlation exceeded the threshold.
        self.record_collinear = None

    def identify_collinear(self, data, correlation_threshold):
        """Drop features that are highly correlated with an earlier feature.

        Args:
            data: DataFrame of features.
            correlation_threshold: absolute correlation above which the
                later feature of a pair is dropped.

        Returns:
            ``data`` restricted to the retained columns. The retained names
            are stored in ``self.columns``, the correlation matrix in
            ``self.corr_matrix`` and the offending pairs in
            ``self.record_collinear``.
        """
        columns = data.columns
        self.correlation_threshold = correlation_threshold

        corr_matrix = data.corr()
        self.corr_matrix = corr_matrix

        # Keep only the strict upper triangle so each pair is seen once.
        # The builtin bool replaces np.bool, removed in NumPy 1.24.
        upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper = corr_matrix.where(upper_mask)

        # The sign of the correlation is irrelevant, hence abs().
        to_drop = [
            column for column in upper.columns
            if any(upper[column].abs() > correlation_threshold)
        ]
        dropped = set(to_drop)
        obtain_columns = [column for column in columns if column not in dropped]
        self.columns = obtain_columns

        # One small frame per dropped feature listing its correlated partners.
        pair_frames = []
        for column in to_drop:
            over_threshold = upper[column].abs() > correlation_threshold
            corr_features = list(upper.index[over_threshold])
            corr_values = list(upper[column][over_threshold])
            drop_features = [column for _ in range(len(corr_features))]
            pair_frames.append(pd.DataFrame({
                'drop_feature': drop_features,
                'corr_feature': corr_features,
                'corr_value': corr_values,
            }))

        # pd.concat replaces DataFrame.append, removed in pandas 2.0.
        if pair_frames:
            record_collinear = pd.concat(pair_frames, ignore_index=True)
        else:
            record_collinear = pd.DataFrame(
                columns=['drop_feature', 'corr_feature', 'corr_value']
            )
        self.record_collinear = record_collinear
        return data[obtain_columns]

    def identify_importance_lgbm(self, features, labels, p_importance=0.8,
                                 eval_metric='auc', task='classification',
                                 n_iterations=10, early_stopping=True):
        """Select features by averaged LightGBM importance.

        Args:
            features: DataFrame of features; it is one-hot encoded with
                ``pd.get_dummies`` before training.
            labels: array-like of targets, flattened to one dimension.
            p_importance: features are kept while their cumulative
                normalised importance is ``<= p_importance``.
            eval_metric: metric passed to ``fit`` when a validation split
                is used.
            task: ``'classification'`` or ``'regression'``.
            n_iterations: number of independent fits whose importances are
                averaged.
            early_stopping: when True, 15% of the rows are held out and
                passed to ``fit`` as ``eval_set``.
                NOTE(review): no stopping-rounds argument is passed, so as
                written this flag only controls whether the model trains
                on a random 85% split -- confirm the intent.

        Returns:
            The original (un-encoded) ``features`` restricted to the
            selected columns, whose names are stored in
            ``self.columns_lgbm``.
            NOTE(review): if ``get_dummies`` creates new column names, the
            lookup in the original frame will fail -- assumes all features
            are numeric.

        Raises:
            ValueError: if ``task`` is neither supported value.
        """
        data = features
        features = pd.get_dummies(features)
        feature_names = list(features.columns)

        features = np.array(features)
        labels = np.array(labels).reshape((-1, ))

        feature_importance_values = np.zeros(len(feature_names))
        print('Training Gradient Boosting Model\n')

        for _ in range(n_iterations):
            if task == 'classification':
                model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, verbose=-1)
            elif task == 'regression':
                model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, verbose=-1)
            else:
                raise ValueError('Task must be either "classification" or "regression"')

            if early_stopping:
                train_features, valid_features, train_labels, valid_labels = train_test_split(
                    features, labels, test_size=0.15)
                # NOTE(review): the ``verbose`` keyword of fit() may not be
                # accepted by every LightGBM release -- confirm against the
                # installed version.
                model.fit(train_features, train_labels, eval_metric=eval_metric,
                          eval_set=[(valid_features, valid_labels)], verbose=-1)
                # Free the split copies before the next iteration.
                gc.enable()
                del train_features, train_labels, valid_features, valid_labels
                gc.collect()
            else:
                model.fit(features, labels)

            # Running mean of the importances over all iterations.
            feature_importance_values += model.feature_importances_ / n_iterations

        feature_importances = pd.DataFrame(
            {'feature': feature_names, 'importance': feature_importance_values})
        feature_importances = feature_importances.sort_values(
            'importance', ascending=False).reset_index(drop=True)

        # Normalise so that the importances add up to one, then accumulate.
        feature_importances['normalized_importance'] = (
            feature_importances['importance'] / feature_importances['importance'].sum())
        feature_importances['cumulative_importance'] = np.cumsum(
            feature_importances['normalized_importance'])

        select_df = feature_importances[
            feature_importances['cumulative_importance'] <= p_importance]
        select_columns = select_df['feature']
        self.columns_lgbm = list(select_columns.values)

        # Return the LightGBM selection. The previous code indexed with
        # self.columns, which is the collinearity result (or missing
        # entirely when identify_collinear has not been called).
        res = data[self.columns_lgbm]
        return res

    def filter_select(self, data_x, data_y, k=None, p=50, method=f_classif):
        """Univariate (filter) feature selection.

        Args:
            data_x: DataFrame of features.
            data_y: targets.
            k: number of features to keep; when None, ``p`` is used instead.
            p: percentage of features to keep when ``k`` is None.
            method: scoring function handed to the selector.

        Returns:
            The transformed feature matrix. The boolean mask and the kept
            column names are stored in ``self.supports_filter`` and
            ``self.columns_filter``.
        """
        columns = data_x.columns
        # k / percentile are passed by keyword: recent scikit-learn releases
        # no longer accept them positionally.
        if k is not None:
            model = SelectKBest(method, k=k)
        else:
            model = SelectPercentile(method, percentile=p)
        res = model.fit_transform(data_x, data_y)
        supports = model.get_support()
        self.supports_filter = supports
        self.columns_filter = columns[supports]
        return res

    def wrapper_select(self, data_x, data_y, n, estimator):
        """Recursive feature elimination keeping ``n`` features.

        Args:
            data_x: DataFrame of features.
            data_y: targets.
            n: number of features to keep.
            estimator: estimator driving the elimination.

        Returns:
            The transformed feature matrix. The boolean mask and the kept
            column names are stored in ``self.supports_wrapper`` and
            ``self.columns_wrapper``.
        """
        columns = data_x.columns
        model = RFE(estimator=estimator, n_features_to_select=n)
        res = model.fit_transform(data_x, data_y)
        # Mask marking the position of each kept feature in the input.
        supports = model.get_support()
        self.supports_wrapper = supports
        self.columns_wrapper = columns[supports]
        return res

    def embedded_select(self, data_x, data_y, estimator, threshold=None):
        """Model-based (embedded) feature selection.

        Args:
            data_x: DataFrame of features.
            data_y: targets.
            estimator: unfitted estimator wrapped by ``SelectFromModel``.
            threshold: forwarded to ``SelectFromModel``.

        Returns:
            The transformed feature matrix. The boolean mask and the kept
            column names are stored in ``self.supports_embedded`` and
            ``self.columns_embedded``.
        """
        columns = data_x.columns
        model = SelectFromModel(estimator=estimator, prefit=False, threshold=threshold)
        res = model.fit_transform(data_x, data_y)
        supports = model.get_support()
        self.supports_embedded = supports
        self.columns_embedded = columns[supports]
        return res
f = FeatureSelection()

# Start from the standardized features and remove collinear ones first;
# every later selector works on that reduced set.
data_ncol = f.identify_collinear(data_ws, 0.8)
data_lgbm = f.identify_importance_lgbm(data_ncol, data_y.values)
data_filter = f.filter_select(data_ncol, data_y)

# Wrapper selection: keep half of the remaining features.
estimator = LinearSVC()
n = int(0.5 * len(data_ncol.columns))
data_wrapper = f.wrapper_select(data_ncol, data_y, n, estimator)

# Embedded selection uses the L1-penalised model. The previous code built
# this estimator but passed the L2 ``estimator`` instead. dual=False is
# needed because LinearSVC cannot fit penalty='l1' in its dual formulation.
estimator_embedded = LinearSVC(penalty='l1', dual=False)
data_embedded = f.embedded_select(data_ncol, data_y, estimator_embedded)

# The order of this list is the contract with the classification stage,
# which reads the entries back by position.
l = [data_x, data_y, data_ws, data_ncol, data_lgbm, data_filter, data_wrapper, data_embedded]
with open('feature_selection_res.plckle', 'wb') as pk_file:
    pickle.dump(l, pk_file)

利用机器学习和深度学习算法分类

import pandas as pdimport numpy as npimport picklefrom sklearn.svm import SVC,LinearSVCfrom sklearn.ensemble import RandomForestClassifierimport lightgbm as lgb from sklearn.model_selection import train_test_split,GridSearchCVimport tensorflow as tfimport warningswarnings.filterwarnings('ignore')
# Read back the list written by the feature-selection stage; entries are
# identified purely by position.
with open('feature_selection_res.plckle', 'rb') as pk_file:
    pk_data = pickle.load(pk_file)

(data_x, data_y, data_ws, data_ncol,
 data_lgbm, data_filter, data_wrapper, data_embedded) = pk_data[:8]

# Independent 70/30 train/test splits for three of the feature sets.
lgbm_train_x, lgbm_test_x, lgbm_train_y, lgbm_test_y = train_test_split(
    data_lgbm, data_y, test_size=0.3
)
embedded_train_x, embedded_test_x, embedded_train_y, embedded_test_y = train_test_split(
    data_embedded, data_y, test_size=0.3
)
wrapper_train_x, wrapper_test_x, wrapper_train_y, wrapper_test_y = train_test_split(
    data_wrapper, data_y, test_size=0.3
)
def _report(search):
    """Print the best cross-validated score and parameters of a fitted search."""
    print(search.best_score_)
    print(search.best_params_)


# SVC on the LightGBM-selected features.
lgbm_svc = SVC(max_iter=1000)
lgbm_svc_params = {
    'C': [1, 1.5, 1.8, 2],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
}
lgbm_svc_model = GridSearchCV(
    estimator=lgbm_svc, param_grid=lgbm_svc_params, scoring='roc_auc'
)
lgbm_svc_model.fit(lgbm_train_x, lgbm_train_y)
_report(lgbm_svc_model)

# SVC on the embedded-selection features.
embedded_svc = SVC(max_iter=100)
embedded_svc_params = {
    'C': [0.5, 1, 1.5, 2, 2.5],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
}
embedded_svc_model = GridSearchCV(
    estimator=embedded_svc, param_grid=embedded_svc_params, scoring='roc_auc'
)
embedded_svc_model.fit(embedded_train_x, embedded_train_y)
_report(embedded_svc_model)

# Random forest on the LightGBM-selected features.
lgbm_rf = RandomForestClassifier(max_depth=5)
lgbm_rf_params = {
    'n_estimators': range(5, 15, 3),
    'min_samples_split': range(2, 20, 2),
}
lgbm_rf_model = GridSearchCV(
    estimator=lgbm_rf, param_grid=lgbm_rf_params, scoring='roc_auc'
)
lgbm_rf_model.fit(lgbm_train_x, lgbm_train_y)
_report(lgbm_rf_model)
# Hyper-parameters.
keep_prob = 0.5      # dropout keep probability used while training
LR = 0.1             # optimizer learning rate
l1_num = 30          # units in the first hidden layer
l2_num = 1           # units in the second hidden layer
batch_size = 10      # number of outer rounds (a loss report after each)
times = 1000         # optimizer steps per outer round

# Labels as column vectors so they match the network output shape.
v1 = np.array(lgbm_train_y.values)
lgbm_train_y_a = v1.reshape((-1, 1))
v2 = np.array(lgbm_test_y.values)
lgbm_test_y_a = v2.reshape((-1, 1))
n_features = len(lgbm_test_x.columns)

x = tf.placeholder(tf.float32, [None, n_features])
y = tf.placeholder(tf.float32, [None, 1])
# Dropout keep probability is fed per run so that it can be switched off
# (1.0) when the loss is evaluated. It used to be a constant 0.5, which
# kept dropout active for the reported train/test losses too.
keep_prob_ph = tf.placeholder(tf.float32)


def weights(shape):
    """Return a trainable variable drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def biases(shape):
    """Return a trainable variable of the given shape initialised to 0.1."""
    return tf.Variable(tf.zeros(shape) + 0.1)


# Hidden layer 1: ReLU + dropout.
weights1 = weights([n_features, l1_num])
biases1 = biases([1, l1_num])
w_plus_b1 = tf.matmul(x, weights1) + biases1
l1 = tf.nn.relu(w_plus_b1)
l1_dropout = tf.nn.dropout(l1, keep_prob=keep_prob_ph)

# Hidden layer 2: ReLU + dropout.
# NOTE(review): with l2_num = 1 this is a single-unit bottleneck -- confirm
# that is intended.
weight2 = weights([l1_num, l2_num])
biases2 = biases([l2_num])
w_plus_b2 = tf.matmul(l1_dropout, weight2) + biases2
l2 = tf.nn.relu(w_plus_b2)
l2_dropout = tf.nn.dropout(l2, keep_prob=keep_prob_ph)

# Output layer. The pre-activation goes straight into the sigmoid: a ReLU
# in between makes the input non-negative, so the output could never fall
# below 0.5 and class 0 could not be represented.
weight3 = weights([l2_num, 1])
biases3 = biases([1])
w_plus_b3 = tf.matmul(l2_dropout, weight3) + biases3
l3 = w_plus_b3
res = tf.nn.sigmoid(l3)

loss = tf.reduce_mean(tf.square(y - res))
train_steps = tf.train.AdadeltaOptimizer(LR).minimize(loss)
initialize = tf.global_variables_initializer()

train_feed = {x: lgbm_train_x, y: lgbm_train_y_a, keep_prob_ph: keep_prob}
train_eval_feed = {x: lgbm_train_x, y: lgbm_train_y_a, keep_prob_ph: 1.0}
test_eval_feed = {x: lgbm_test_x, y: lgbm_test_y_a, keep_prob_ph: 1.0}

with tf.Session() as sess:
    sess.run(initialize)
    for batch in range(batch_size):
        # Every step uses the full training set. The loop variable was
        # renamed from ``time`` to avoid shadowing the module name.
        for step in range(times):
            sess.run(train_steps, feed_dict=train_feed)
        # Despite their names these two values are losses (MSE).
        accuracy = sess.run(loss, feed_dict=train_eval_feed)
        accuracy_test = sess.run(loss, feed_dict=test_eval_feed)
        print('train loss is '+ str(batch) + ': '+ str(accuracy))
        print('test loss is' + str(accuracy_test))
分享到:
举报财经168客户端下载

全部回复

0/140

投稿 您想发表你的观点和看法?

更多人气分析师

  • 张亦巧

    人气2200文章4145粉丝45

    暂无个人简介信息

  • 王启蒙现货黄金

    人气304文章3275粉丝8

    本人做分析师以来,并专注于贵金属投资市场,尤其是在现货黄金...

  • 指导老师

    人气1864文章4423粉丝52

    暂无个人简介信息

  • 李冉晴

    人气2320文章3821粉丝34

    李冉晴,专业现贷实盘分析师。

  • 梁孟梵

    人气2176文章3177粉丝39

    qq:2294906466 了解群指导添加微信mfmacd

  • 张迎妤

    人气1896文章3305粉丝34

    个人专注于行情技术分析,消息面解读剖析,给予您第一时间方向...

  • 金泰铬J

    人气2328文章3925粉丝51

    投资问答解咨询金泰铬V/信tgtg67即可获取每日的实时资讯、行情...

  • 金算盘

    人气2696文章7761粉丝125

    高级分析师,混过名校,厮杀于股市和期货、证券市场多年,专注...

  • 金帝财神

    人气4760文章8329粉丝119

    本文由资深分析师金帝财神微信:934295330,指导黄金,白银,...

FX168财经

FX168财经学院

FX168财经

FX168北美