基于遗传规划自动挖掘因子¶
摘要¶
本文基于聚宽提供的开源数据模块jqdatasdk获取标的的行情数据,使用gplearn构建了alpha因子表达式,并采用jqfactor_analyzer对构建的alpha因子进行了单因子分析,
为因子挖掘提供了一条思路。
虽然市面上有流行的alpha191和alpha101,但鲜有文献详细描述构建原理及方法。
本文主要起抛砖引玉的作用,希望了解这部分知识的同学能分享下相关内容,谢谢!
导入所需的模块¶
import numpy as np
import pandas as pd
import graphviz
from scipy.stats import rankdata
import pickle
from gplearn import genetic
from gplearn.functions import make_function
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from gplearn.fitness import make_fitness
from sklearn.utils import check_random_state
from sklearn.model_selection import train_test_split
import jqdatasdk as jq
import jqfactor_analyzer as ja
jq.auth('手机号', '密码')
import warnings
warnings.filterwarnings("ignore")
使用jqdatasdk获取及处理数据¶
# 设置起止时间
start_date = '2019-01-01'
end_date = '2019-06-01'
fields = ['open', 'high', 'low', 'avg', 'pre_close', 'high_limit','low_limit', 'close']
stock_price = jq.get_price('000001.XSHE', start_date=start_date, end_date=end_date, fq='post', fields=fields)
stock_price['pct'] = stock_price['close'].pct_change(periods=1)
data = stock_price[fields].values
target = stock_price['pct'].values
test_size=0.2
test_num = int(len(data)*test_size)
X_train = data[:-test_num]
X_test = data[-test_num:]
y_train = np.nan_to_num(target[:-test_num])
y_test = np.nan_to_num(target[-test_num:])
定义函数¶
# 系统自带的函数群
"""
Available individual functions are:
‘add’ : addition, arity=2.
‘sub’ : subtraction, arity=2.
‘mul’ : multiplication, arity=2.
‘div’ : protected division where a denominator near-zero returns 1., arity=2.
‘sqrt’ : protected square root where the absolute value of the argument is used, arity=1.
‘log’ : protected log where the absolute value of the argument is used and a near-zero argument returns 0., arity=1.
‘abs’ : absolute value, arity=1.
‘neg’ : negative, arity=1.
‘inv’ : protected inverse where a near-zero argument returns 0., arity=1.
‘max’ : maximum, arity=2.
‘min’ : minimum, arity=2.
‘sin’ : sine (radians), arity=1.
‘cos’ : cosine (radians), arity=1.
‘tan’ : tangent (radians), arity=1.
"""
# init_function = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan']
init_function = ['add', 'sub', 'mul', 'div']
# 自定义函数, make_function函数群
def _rolling_rank(data):
value = rankdata(data)[-1]
return value
def _rolling_prod(data):
return np.prod(data)
def _ts_sum(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).sum().tolist())
value = np.nan_to_num(value)
return value
def _sma(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).mean().tolist())
value = np.nan_to_num(value)
return value
def _stddev(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).std().tolist())
value = np.nan_to_num(value)
return value
def _ts_rank(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(10).apply(_rolling_rank).tolist())
value = np.nan_to_num(value)
return value
def _product(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(10).apply(_rolling_prod).tolist())
value = np.nan_to_num(value)
return value
def _ts_min(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).min().tolist())
value = np.nan_to_num(value)
return value
def _ts_max(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).max().tolist())
value = np.nan_to_num(value)
return value
def _delta(data):
value = np.diff(data.flatten())
value = np.append(0, value)
return value
def _delay(data):
period=1
value = pd.Series(data.flatten()).shift(1)
value = np.nan_to_num(value)
return value
def _rank(data):
value = np.array(pd.Series(data.flatten()).rank().tolist())
value = np.nan_to_num(value)
return value
def _scale(data):
k=1
data = pd.Series(data.flatten())
value = data.mul(1).div(np.abs(data).sum())
value = np.nan_to_num(value)
return value
def _ts_argmax(data):
window=10
value = pd.Series(data.flatten()).rolling(10).apply(np.argmax) + 1
value = np.nan_to_num(value)
return value
def _ts_argmin(data):
window=10
value = pd.Series(data.flatten()).rolling(10).apply(np.argmin) + 1
value = np.nan_to_num(value)
return value
# make_function函数群
delta = make_function(function=_delta, name='delta', arity=1)
delay = make_function(function=_delay, name='delay', arity=1)
rank = make_function(function=_rank, name='rank', arity=1)
scale = make_function(function=_scale, name='scale', arity=1)
sma = make_function(function=_sma, name='sma', arity=1)
stddev = make_function(function=_stddev, name='stddev', arity=1)
product = make_function(function=_product, name='product', arity=1)
ts_rank = make_function(function=_ts_rank, name='ts_rank', arity=1)
ts_min = make_function(function=_ts_min, name='ts_min', arity=1)
ts_max = make_function(function=_ts_max, name='ts_max', arity=1)
ts_argmax = make_function(function=_ts_argmax, name='ts_argmax', arity=1)
ts_argmin = make_function(function=_ts_argmin, name='ts_argmin', arity=1)
ts_sum = make_function(function=_ts_sum, name='ts_sum', arity=1)
user_function = [delta, delay, rank, scale, sma, stddev, product, ts_rank, ts_min, ts_max, ts_argmax, ts_argmin, ts_sum]
定义适应度¶
def _my_metric(y, y_pred, w):
value = np.sum(np.abs(y) + np.abs(y_pred))
return value
my_metric = make_fitness(function=_my_metric, greater_is_better=True)
生成表达式¶
generations = 5
function_set = init_function + user_function
metric = my_metric
population_size = 100
random_state=0
est_gp = SymbolicTransformer(
feature_names=fields,
function_set=function_set,
generations=generations,
metric=metric,
population_size=population_size,
tournament_size=20,
random_state=random_state,
)
est_gp.fit(X_train, y_train)
# 将模型保存到本地
with open('gp_model.pkl', 'wb') as f:
pickle.dump(est_gp, f)
# 获取较优的表达式
best_programs = est_gp._best_programs
best_programs_dict = {}
for p in best_programs:
factor_name = 'alpha_' + str(best_programs.index(p) + 1)
best_programs_dict[factor_name] = {'fitness':p.fitness_, 'expression':str(p), 'depth':p.depth_, 'length':p.length_}
best_programs_dict = pd.DataFrame(best_programs_dict).T
best_programs_dict = best_programs_dict.sort_values(by='fitness')
best_programs_dict
def alpha_factor_graph(num):
# 打印指定num的表达式图
factor = best_programs[num-1]
print(factor)
print('fitness: {0}, depth: {1}, length: {2}'.format(factor.fitness_, factor.depth_, factor.length_))
dot_data = factor.export_graphviz()
graph = graphviz.Source(dot_data)
graph.render('images/alpha_factor_graph', format='png', cleanup=True)
return graph
使用表达式构建因子(以alpha_10为例)¶
# 打印因子alpha_10的结构图
graph10 = alpha_factor_graph(10)
graph10
# alpha_10 delta(div(stddev(stddev(ts_max(avg))), stddev(ts_rank(ts_argmin(close)))))
def alpha_10(df):
value1 = _stddev(_stddev(_ts_max(np.array(df['avg'].tolist()))))
value2 = _stddev(_ts_rank(_ts_argmin(np.array(df['close'].tolist()))))
value = delta(value1/value2)
return value
stock_price['alpha_10'] = alpha_10(stock_price)
单因子分析构建的因子¶
# 设置起止时间
start_date = '2019-01-01'
end_date = '2019-06-01'
# 设置调仓周期
periods=(1, 5, 20)
# 设置分层数量
quantiles=5
# 设置股票池
scu='000300.XSHG'
securities = jq.get_index_stocks(scu)
# 获取需要分析的数据
factor_data = pd.DataFrame(columns=securities, index=stock_price.index)
for security in securities:
stock_price = jq.get_price(security, start_date=start_date, end_date=end_date, fq='post', fields=fields)
stock_price['pct'] = stock_price['close'].pct_change(periods=1)
stock_price['alpha_10'] = alpha_10(stock_price)
factor_data[security] = stock_price['alpha_10']
# 使用获取的因子值进行单因子分析
far = ja.analyze_factor(factor=factor_data,
weight_method='avg',
industry='jq_l1',
quantiles=quantiles,
periods=periods,
max_loss=0.25)
# 生成统计图表
far.create_full_tear_sheet(
demeaned=False, group_adjust=False, by_group=False,
turnover_periods=None, avgretplot=(5, 15), std_bar=False
)
参考文献¶
- jqdatasdk文档:https://www.joinquant.com/help/api/help?name=JQData
- jqfactor_analyzer文档:https://github.com/JoinQuant/jqfactor_analyzer
- 因子分析:https://www.joinquant.com/help/api/help?name=factor
- 单因子分析:https://ke.qq.com/course/394206
- gplearn文档:https://gplearn.readthedocs.io/en/stable/
- 华泰金工:基于遗传规划的选股因子挖掘——华泰人工智能系列之二十一
- 华泰金工:再探基于遗传规划的选股因子挖掘——华泰人工智能系列之二十三
- 广发证券:基于遗传规划的智能交易策略方法———另类交易策略系列之九
- 申万宏源:寻找不一样的ALPHA-遗传算法和遗传规划