Recently I have been studying strategy development with machine learning and deep learning, trying to uncover deeper patterns in the market.
I wrote a simple code framework covering data acquisition, data processing, feature extraction, and classification with machine-learning and deep-learning algorithms. The classification part at the end only includes partial parameter tuning; refining it further on this basis is fairly easy.
The data in this post was gathered casually. The emphasis is on method and approach, not on conclusions, because reaching useful conclusions requires a lot of detailed work on data construction, feature extraction, and final classifier tuning, and truly useful conclusions are core secrets that generally would not be shared anyway.
Apart from the first part, data acquisition, which relies on the JoinQuant (聚宽) platform's database, the other modules are best run on your own machine. The data storage and loading modules are already written, so you can copy them locally and run them directly.
For a detailed introduction to the feature-selection part, see the previous post.
I hope this is of some use to readers who are just getting started with machine learning.
import xlrd
from jqfactor import *
import pandas as pd
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool
from jqdata import *
import time
start_clock = time.clock()

# Read the factor names from the Excel sheet; each column holds one factor category.
ExcellFile = xlrd.open_workbook('FactorTable.xlsx')
name = ExcellFile.sheet_names()
sheet = ExcellFile.sheet_by_name(name[0])
factor_quality = list(sheet.col_values(1))
factor_fundamental = list(sheet.col_values(2))[:28]
factor_mood = list(sheet.col_values(3))[:35]
factor_growth = list(sheet.col_values(4))[:8]
factor_risk = list(sheet.col_values(5))[:12]
factor_stock = list(sheet.col_values(6))[:15]
stocks = ['000001.XSHE']
start_date = '2015-01-01'
end_date = '2018-11-08'
factor_list = (factor_quality + factor_fundamental + factor_mood
               + factor_growth + factor_risk + factor_stock)
trade_date_list = get_trade_days(start_date, end_date)
def get_jq_factor_values(date=None):
    """Fetch factor values from the JoinQuant factor library.

    If a single trade date is passed (e.g. by the thread pool below),
    only that day is queried; otherwise the whole date range is used.
    """
    if date is None:
        return get_factor_values(stocks, factor_list, start_date=start_date, end_date=end_date)
    return get_factor_values(stocks, factor_list, start_date=date, end_date=date)

res = get_jq_factor_values()
def get_raw_factor_dict():
    # Optional multithreaded variant: fetch the factors one trade day at a time.
    pool = ThreadPool(processes=len(trade_date_list))
    frame_list = pool.map(get_jq_factor_values, trade_date_list)
    pool.close()
    pool.join()
    return frame_list
import pickle

p_file = open('all_factors.pkl', 'wb')
pickle.dump(res, p_file)
p_file.close()
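Once the pickle has been downloaded from the research environment, it can be inspected locally before any further processing. A minimal sketch, under the assumption that get_factor_values returns a dict keyed by factor name with a DataFrame of dates by stock codes as each value:

import pickle

# Load the factor data saved on the JoinQuant research platform.
with open('all_factors.pkl', 'rb') as pl_file:
    factor_dict = pickle.load(pl_file)

# Assumption: {factor_name: DataFrame(index=dates, columns=stock codes)}.
print(len(factor_dict), 'factors loaded')
sample_name = next(iter(factor_dict))
print(sample_name)
print(factor_dict[sample_name].head())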
def get_day_profit(stocks, end_date, start_date=None, count=-1, pre_num=1):
    """Compute daily return labels.

    input:
        stocks: list or Series, stock codes
        start_date: start date
        end_date: end date
        count: number of rows to take backwards from end_date; used instead of start_date
        pre_num: int, number of days over which the return is computed
    output:
        profit: DataFrame indexed by date; returns greater than 0 are labelled 1, otherwise 0
    """
    if count == -1:
        price = get_price(stocks, start_date, end_date, fields=['close'])['close']
    else:
        price = get_price(stocks, end_date=end_date, count=count, fields=['close'])['close']
    profit = price.pct_change(periods=pre_num).dropna()
    profit[profit > 0] = 1
    profit[profit < 0] = 0
    profit.columns = ['profit_class']
    return profit

profit = get_day_profit(stocks, end_date, start_date)
p_file = open('profit.pkl', 'wb')
pickle.dump(profit, p_file)
p_file.close()
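Before training anything it is worth checking how balanced the two label classes are, since a heavily skewed label makes accuracy and AUC harder to interpret. A minimal sketch on the saved labels (the column name profit_class comes from get_day_profit above):

import pickle

with open('profit.pkl', 'rb') as pl_file:
    profit = pickle.load(pl_file)

# Share of up days (1) versus down/flat days (0) in the sample.
print(profit['profit_class'].value_counts(normalize=True))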
import xlrd
import pickle
import gc
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.feature_selection import (SelectKBest, SelectPercentile, SelectFromModel,
                                       chi2, f_classif, mutual_info_classif, RFE)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, LinearSVR, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# Load the saved factors and labels, and shift the labels back one day
# so that today's factors predict tomorrow's return class.
with open('all_factors.pkl', 'rb') as pl_file:
    frame_list = pickle.load(pl_file)
factor_df = pd.concat(frame_list, axis=1)

with open('profit.pkl', 'rb') as pl_file:
    profit = pickle.load(pl_file)

profit_shift = profit.shift(-1)
data = pd.concat([factor_df, profit_shift], axis=1)
# Drop all-NaN columns and rows, fill the remaining gaps with column means,
# and drop the first/last rows introduced by the shift.
data_dn1 = data.dropna(axis=1, how='all')
data_dn0 = data_dn1.dropna(how='all')
data_dn = data_dn0.fillna(data_dn0.mean())
data_df = data_dn.drop([data_dn.index[0], data_dn.index[-1]])
columns = data_df.columns
data_x = data_df[columns[:-1]]
data_y = data_df[columns[-1]]
def winsorize_and_standarlize(data, qrange=[0.05, 0.95], axis=0):
    """Winsorize then standardize the input.

    input:
        data: DataFrame or Series, the input data
        qrange: list, qrange[0] is the lower quantile and qrange[1] the upper quantile;
                values beyond them are replaced by the quantile values
    """
    if isinstance(data, pd.DataFrame):
        if axis == 0:
            q_down = data.quantile(qrange[0])
            q_up = data.quantile(qrange[1])
            for n in data.columns:
                data[n][data[n] > q_up[n]] = q_up[n]
                data[n][data[n] < q_down[n]] = q_down[n]
            data = (data - data.mean()) / data.std()
            data = data.fillna(0)
        else:
            data = data.stack().unstack(0)
            q_down = data.quantile(qrange[0])
            q_up = data.quantile(qrange[1])
            for n in data.columns:
                data[n][data[n] > q_up[n]] = q_up[n]
                data[n][data[n] < q_down[n]] = q_down[n]
            data = (data - data.mean()) / data.std()
            data = data.stack().unstack(0)
            data = data.fillna(0)
    elif isinstance(data, pd.Series):
        q_down = data.quantile(qrange[0])
        q_up = data.quantile(qrange[1])
        data[data > q_up] = q_up
        data[data < q_down] = q_down
        data = (data - data.mean()) / data.std()
    return data

data_ws = winsorize_and_standarlize(data_x)
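As a quick sanity check of the helper, here is a minimal sketch on synthetic data (the values are made up purely for illustration):

np.random.seed(0)
toy = pd.DataFrame({'f1': np.random.randn(100), 'f2': np.random.randn(100) * 10})
toy.iloc[0, 0] = 50  # plant an outlier

toy_ws = winsorize_and_standarlize(toy.copy())
# After winsorizing and standardizing, the outlier is clipped and each column
# should have roughly zero mean and unit standard deviation.
print(toy_ws.describe().loc[['mean', 'std', 'min', 'max']])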
class FeatureSelection():
    """Feature selection helpers.

    identify_collinear: for each pair of features whose absolute correlation exceeds
        correlation_threshold, drop one of the pair.
    identify_importance_lgbm: fit a LightGBM model, rank features by importance and keep
        the top features whose cumulative normalized importance stays within p_importance.
    filter_select: univariate selection; with k given, SelectKBest keeps the best k features
        under the scoring function `method`, otherwise SelectPercentile keeps the top p percent.
    wrapper_select: recursive feature elimination (RFE) driven by `estimator`, keeping
        n_features_to_select features.
    embedded_select: SelectFromModel on `estimator`, keeping features above `threshold`.
    """

    def __init__(self):
        self.supports_filter = None    # bool mask: whether each feature was selected
        self.supports_wrapper = None
        self.supports_embedded = None
        self.columns_lgbm = None       # names of the selected features
        self.columns_filter = None
        self.columns_wrapper = None
        self.columns_embedded = None
        self.record_collinear = None   # correlated pairs above the threshold

    def identify_collinear(self, data, correlation_threshold):
        columns = data.columns
        self.correlation_threshold = correlation_threshold

        # Calculate the correlations between every column
        corr_matrix = data.corr()
        self.corr_matrix = corr_matrix

        # Extract the upper triangle of the correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Select the features with correlations above the threshold
        # Need to use the absolute value
        to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]
        obtain_columns = [column for column in columns if column not in to_drop]
        self.columns = obtain_columns

        # Dataframe to hold correlated pairs
        record_collinear = pd.DataFrame(columns=['drop_feature', 'corr_feature', 'corr_value'])

        # Iterate through the columns to drop
        for column in to_drop:
            # Find the correlated features
            corr_features = list(upper.index[upper[column].abs() > correlation_threshold])
            # Find the correlated values
            corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
            drop_features = [column for _ in range(len(corr_features))]

            # Record the information (need a temp df for now)
            temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features,
                                              'corr_feature': corr_features,
                                              'corr_value': corr_values})
            # Add to dataframe
            record_collinear = pd.concat([record_collinear, temp_df], ignore_index=True)

        self.record_collinear = record_collinear
        return data[obtain_columns]

    def identify_importance_lgbm(self, features, labels, p_importance=0.8, eval_metric='auc',
                                 task='classification', n_iterations=10, early_stopping=True):
        # One hot encoding
        data = features
        features = pd.get_dummies(features)

        # Extract feature names
        feature_names = list(features.columns)

        # Convert to np array
        features = np.array(features)
        labels = np.array(labels).reshape((-1, ))

        # Empty array for feature importances
        feature_importance_values = np.zeros(len(feature_names))

        print('Training Gradient Boosting Model\n')

        # Iterate through each fold
        for _ in range(n_iterations):
            if task == 'classification':
                model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, verbose=-1)
            elif task == 'regression':
                model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, verbose=-1)
            else:
                raise ValueError('Task must be either "classification" or "regression"')

            # If training using early stopping need a validation set
            if early_stopping:
                train_features, valid_features, train_labels, valid_labels = train_test_split(
                    features, labels, test_size=0.15)
                # Train the model with early stopping
                model.fit(train_features, train_labels, eval_metric=eval_metric,
                          eval_set=[(valid_features, valid_labels)], verbose=-1)
                # Clean up memory
                gc.enable()
                del train_features, train_labels, valid_features, valid_labels
                gc.collect()
            else:
                model.fit(features, labels)

            # Record the feature importances
            feature_importance_values += model.feature_importances_ / n_iterations

        feature_importances = pd.DataFrame({'feature': feature_names,
                                            'importance': feature_importance_values})

        # Sort features according to importance
        feature_importances = feature_importances.sort_values('importance', ascending=False).reset_index(drop=True)

        # Normalize the feature importances to add up to one
        feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
        feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])

        select_df = feature_importances[feature_importances['cumulative_importance'] <= p_importance]
        select_columns = select_df['feature']
        self.columns_lgbm = list(select_columns.values)
        res = data[self.columns_lgbm]
        return res

    def filter_select(self, data_x, data_y, k=None, p=50, method=f_classif):
        columns = data_x.columns
        if k is not None:
            model = SelectKBest(method, k=k)
        else:
            model = SelectPercentile(method, percentile=p)
        res = model.fit_transform(data_x, data_y)
        supports = model.get_support()
        self.supports_filter = supports
        self.columns_filter = columns[supports]
        return res

    def wrapper_select(self, data_x, data_y, n, estimator):
        columns = data_x.columns
        model = RFE(estimator=estimator, n_features_to_select=n)
        res = model.fit_transform(data_x, data_y)
        supports = model.get_support()  # marks the positions of the selected features
        self.supports_wrapper = supports
        self.columns_wrapper = columns[supports]
        return res

    def embedded_select(self, data_x, data_y, estimator, threshold=None):
        columns = data_x.columns
        model = SelectFromModel(estimator=estimator, prefit=False, threshold=threshold)
        res = model.fit_transform(data_x, data_y)
        supports = model.get_support()
        self.supports_embedded = supports
        self.columns_embedded = columns[supports]
        return res
f = FeatureSelection()
data_ncol = f.identify_collinear(data_ws, 0.8)
data_lgbm = f.identify_importance_lgbm(data_ncol, data_y.values)
data_filter = f.filter_select(data_ncol, data_y)

estimator = LinearSVC()
n = int(0.5 * len(data_ncol.columns))
data_wrapper = f.wrapper_select(data_ncol, data_y, n, estimator)

# An L1-penalized LinearSVC needs dual=False; pass it (not the plain estimator) to embedded_select.
estimator_embedded = LinearSVC(penalty='l1', dual=False)
data_embedded = f.embedded_select(data_ncol, data_y, estimator_embedded)

l = [data_x, data_y, data_ws, data_ncol, data_lgbm, data_filter, data_wrapper, data_embedded]
with open('feature_selection_res.pkl', 'wb') as pk_file:
    pickle.dump(l, pk_file)
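After running the different selectors it is often useful to compare which factor names each method kept; the class stores them on its attributes. A minimal sketch:

# Compare the feature sets chosen by each selection method.
print('lgbm importance:', len(f.columns_lgbm), 'features')
print('filter         :', len(f.columns_filter), 'features')
print('wrapper (RFE)  :', len(f.columns_wrapper), 'features')
print('embedded       :', len(f.columns_embedded), 'features')

# Features that every method agrees on.
common = set(f.columns_lgbm) & set(f.columns_filter) & set(f.columns_wrapper) & set(f.columns_embedded)
print('selected by all methods:', sorted(common))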
import pickle
import warnings

import pandas as pd
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
import tensorflow as tf

warnings.filterwarnings('ignore')
with open('feature_selection_res.pkl', 'rb') as pk_file:
    pk_data = pickle.load(pk_file)

(data_x, data_y, data_ws, data_ncol,
 data_lgbm, data_filter, data_wrapper, data_embedded) = pk_data
lgbm_train_x, lgbm_test_x, lgbm_train_y, lgbm_test_y = train_test_split(data_lgbm, data_y, test_size=0.3)
embedded_train_x, embedded_test_x, embedded_train_y, embedded_test_y = train_test_split(data_embedded, data_y, test_size=0.3)
wrapper_train_x, wrapper_test_x, wrapper_train_y, wrapper_test_y = train_test_split(data_wrapper, data_y, test_size=0.3)
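One caveat worth flagging: train_test_split shuffles by default, which mixes past and future observations for what is really time-series data. A minimal alternative sketch using a simple chronological cut (the 0.7 ratio and the _ts names are illustrative assumptions, not part of the original code), which one might substitute when look-ahead bias is a concern:

# Chronological split: the first 70% of trading days for training, the rest for testing.
split_point = int(len(data_lgbm) * 0.7)
lgbm_train_x_ts, lgbm_test_x_ts = data_lgbm.iloc[:split_point], data_lgbm.iloc[split_point:]
lgbm_train_y_ts, lgbm_test_y_ts = data_y.iloc[:split_point], data_y.iloc[split_point:]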
lgbm_svc = SVC(max_iter=1000)
lgbm_svc_params = {'C': [1, 1.5, 1.8, 2], 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}
lgbm_svc_model = GridSearchCV(estimator=lgbm_svc, param_grid=lgbm_svc_params, scoring='roc_auc')
lgbm_svc_model.fit(lgbm_train_x, lgbm_train_y)
print(lgbm_svc_model.best_score_)
print(lgbm_svc_model.best_params_)
embedded_svc = SVC(max_iter=100)
embedded_svc_params = {'C': [0.5, 1, 1.5, 2, 2.5], 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}
embedded_svc_model = GridSearchCV(estimator=embedded_svc, param_grid=embedded_svc_params, scoring='roc_auc')
embedded_svc_model.fit(embedded_train_x, embedded_train_y)
print(embedded_svc_model.best_score_)
print(embedded_svc_model.best_params_)
lgbm_rf = RandomForestClassifier(max_depth=5)
lgbm_rf_params = {'n_estimators': range(5, 15, 3), 'min_samples_split': range(2, 20, 2)}
lgbm_rf_model = GridSearchCV(estimator=lgbm_rf, param_grid=lgbm_rf_params, scoring='roc_auc')
lgbm_rf_model.fit(lgbm_train_x, lgbm_train_y)
print(lgbm_rf_model.best_score_)
print(lgbm_rf_model.best_params_)
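The grid searches above only report cross-validated scores on the training split. A minimal sketch for checking the refitted best estimators on the held-out test data (roc_auc_score and accuracy_score from sklearn.metrics are additions here, not part of the original code):

from sklearn.metrics import roc_auc_score, accuracy_score

# GridSearchCV refits the best parameter combination on the whole training split,
# so best_estimator_ can be applied to the held-out test data directly.
for name, model, test_x, test_y in [
        ('SVC (lgbm features)', lgbm_svc_model, lgbm_test_x, lgbm_test_y),
        ('SVC (embedded features)', embedded_svc_model, embedded_test_x, embedded_test_y),
        ('RF (lgbm features)', lgbm_rf_model, lgbm_test_x, lgbm_test_y)]:
    pred = model.best_estimator_.predict(test_x)
    print(name, 'accuracy:', accuracy_score(test_y, pred), 'AUC:', roc_auc_score(test_y, pred))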
# A small fully connected network built with the TensorFlow 1.x graph API
# (on TensorFlow 2.x, use `import tensorflow.compat.v1 as tf; tf.disable_v2_behavior()`).
keep_prob = 0.5
LR = 0.1
l1_num = 30
l2_num = 1
batch_size = 10
times = 1000

# Reshape the labels into column vectors for the placeholders below.
lgbm_train_y_a = np.array(lgbm_train_y.values).reshape((-1, 1))
lgbm_test_y_a = np.array(lgbm_test_y.values).reshape((-1, 1))
n_features = len(lgbm_test_x.columns)

x = tf.placeholder(tf.float32, [None, n_features])
y = tf.placeholder(tf.float32, [None, 1])

def weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def biases(shape):
    return tf.Variable(tf.zeros(shape) + 0.1)

# Two hidden layers with ReLU and dropout, then a sigmoid output.
weights1 = weights([n_features, l1_num])
biases1 = biases([1, l1_num])
l1 = tf.nn.relu(tf.matmul(x, weights1) + biases1)
l1_dropout = tf.nn.dropout(l1, keep_prob=keep_prob)

weight2 = weights([l1_num, l2_num])
biases2 = biases([l2_num])
l2 = tf.nn.relu(tf.matmul(l1_dropout, weight2) + biases2)
l2_dropout = tf.nn.dropout(l2, keep_prob=keep_prob)

weight3 = weights([l2_num, 1])
biases3 = biases([1])
l3 = tf.nn.relu(tf.matmul(l2_dropout, weight3) + biases3)
res = tf.nn.sigmoid(l3)

# Mean squared error between the 0/1 label and the sigmoid output.
loss = tf.reduce_mean(tf.square(y - res))
train_steps = tf.train.AdadeltaOptimizer(LR).minimize(loss)
initialize = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(initialize)
    for batch in range(batch_size):
        for step in range(times):
            sess.run(train_steps, feed_dict={x: lgbm_train_x, y: lgbm_train_y_a})
        train_loss = sess.run(loss, feed_dict={x: lgbm_train_x, y: lgbm_train_y_a})
        test_loss = sess.run(loss, feed_dict={x: lgbm_test_x, y: lgbm_test_y_a})
        print('round ' + str(batch) + ' train loss: ' + str(train_loss))
        print('round ' + str(batch) + ' test loss: ' + str(test_loss))