本文基于聚宽提供的开源数据模块jqdatasdk获取标的的行情数据,使用gplearn构建了alpha因子表达式,并采用jqfactor_analyzer对构建的alpha因子进行了单因子分析,
为因子挖掘提供了一条思路。
虽然市面上有流行的alpha191和alpha101,但鲜有文献详细描述构建原理及方法。
本文主要起抛砖引玉的作用,希望了解这部分知识的同学能分享下相关内容,谢谢!
import numpy as np
import pandas as pd
import graphviz
from scipy.stats import rankdata
import pickle
from gplearn import genetic
from gplearn.functions import make_function
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from gplearn.fitness import make_fitness
from sklearn.utils import check_random_state
from sklearn.model_selection import train_test_split
import jqdatasdk as jq
import jqfactor_analyzer as ja
jq.auth('手机号', '密码')
import warnings
warnings.filterwarnings("ignore")
auth success
# 设置起止时间
start_date = '2019-01-01'
end_date = '2019-06-01'
fields = ['open', 'high', 'low', 'avg', 'pre_close', 'high_limit','low_limit', 'close']
stock_price = jq.get_price('000001.XSHE', start_date=start_date, end_date=end_date, fq='post', fields=fields)
stock_price['pct'] = stock_price['close'].pct_change(periods=1)
data = stock_price[fields].values
target = stock_price['pct'].values
test_size=0.2
test_num = int(len(data)*test_size)
X_train = data[:-test_num]
X_test = data[-test_num:]
y_train = np.nan_to_num(target[:-test_num])
y_test = np.nan_to_num(target[-test_num:])
# 系统自带的函数群
"""
Available individual functions are:
‘add’ : addition, arity=2.
‘sub’ : subtraction, arity=2.
‘mul’ : multiplication, arity=2.
‘div’ : protected division where a denominator near-zero returns 1., arity=2.
‘sqrt’ : protected square root where the absolute value of the argument is used, arity=1.
‘log’ : protected log where the absolute value of the argument is used and a near-zero argument returns 0., arity=1.
‘abs’ : absolute value, arity=1.
‘neg’ : negative, arity=1.
‘inv’ : protected inverse where a near-zero argument returns 0., arity=1.
‘max’ : maximum, arity=2.
‘min’ : minimum, arity=2.
‘sin’ : sine (radians), arity=1.
‘cos’ : cosine (radians), arity=1.
‘tan’ : tangent (radians), arity=1.
"""
# init_function = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan']
init_function = ['add', 'sub', 'mul', 'div']
# 自定义函数, make_function函数群
def _rolling_rank(data):
value = rankdata(data)[-1]
return value
def _rolling_prod(data):
return np.prod(data)
def _ts_sum(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).sum().tolist())
value = np.nan_to_num(value)
return value
def _sma(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).mean().tolist())
value = np.nan_to_num(value)
return value
def _stddev(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).std().tolist())
value = np.nan_to_num(value)
return value
def _ts_rank(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(10).apply(_rolling_rank).tolist())
value = np.nan_to_num(value)
return value
def _product(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(10).apply(_rolling_prod).tolist())
value = np.nan_to_num(value)
return value
def _ts_min(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).min().tolist())
value = np.nan_to_num(value)
return value
def _ts_max(data):
window=10
value = np.array(pd.Series(data.flatten()).rolling(window).max().tolist())
value = np.nan_to_num(value)
return value
def _delta(data):
value = np.diff(data.flatten())
value = np.append(0, value)
return value
def _delay(data):
period=1
value = pd.Series(data.flatten()).shift(1)
value = np.nan_to_num(value)
return value
def _rank(data):
value = np.array(pd.Series(data.flatten()).rank().tolist())
value = np.nan_to_num(value)
return value
def _scale(data):
k=1
data = pd.Series(data.flatten())
value = data.mul(1).div(np.abs(data).sum())
value = np.nan_to_num(value)
return value
def _ts_argmax(data):
window=10
value = pd.Series(data.flatten()).rolling(10).apply(np.argmax) + 1
value = np.nan_to_num(value)
return value
def _ts_argmin(data):
window=10
value = pd.Series(data.flatten()).rolling(10).apply(np.argmin) + 1
value = np.nan_to_num(value)
return value
# make_function函数群
delta = make_function(function=_delta, name='delta', arity=1)
delay = make_function(function=_delay, name='delay', arity=1)
rank = make_function(function=_rank, name='rank', arity=1)
scale = make_function(function=_scale, name='scale', arity=1)
sma = make_function(function=_sma, name='sma', arity=1)
stddev = make_function(function=_stddev, name='stddev', arity=1)
product = make_function(function=_product, name='product', arity=1)
ts_rank = make_function(function=_ts_rank, name='ts_rank', arity=1)
ts_min = make_function(function=_ts_min, name='ts_min', arity=1)
ts_max = make_function(function=_ts_max, name='ts_max', arity=1)
ts_argmax = make_function(function=_ts_argmax, name='ts_argmax', arity=1)
ts_argmin = make_function(function=_ts_argmin, name='ts_argmin', arity=1)
ts_sum = make_function(function=_ts_sum, name='ts_sum', arity=1)
user_function = [delta, delay, rank, scale, sma, stddev, product, ts_rank, ts_min, ts_max, ts_argmax, ts_argmin, ts_sum]
def _my_metric(y, y_pred, w):
value = np.sum(np.abs(y) + np.abs(y_pred))
return value
my_metric = make_fitness(function=_my_metric, greater_is_better=True)
generations = 5
function_set = init_function + user_function
metric = my_metric
population_size = 100
random_state=0
est_gp = SymbolicTransformer(
feature_names=fields,
function_set=function_set,
generations=generations,
metric=metric,
population_size=population_size,
tournament_size=20,
random_state=random_state,
)
est_gp.fit(X_train, y_train)
# 将模型保存到本地
with open('gp_model.pkl', 'wb') as f:
pickle.dump(est_gp, f)
# 获取较优的表达式
best_programs = est_gp._best_programs
best_programs_dict = {}
for p in best_programs:
factor_name = 'alpha_' + str(best_programs.index(p) + 1)
best_programs_dict[factor_name] = {'fitness':p.fitness_, 'expression':str(p), 'depth':p.depth_, 'length':p.length_}
best_programs_dict = pd.DataFrame(best_programs_dict).T
best_programs_dict = best_programs_dict.sort_values(by='fitness')
best_programs_dict
depth | expression | fitness | length | |
---|---|---|---|---|
alpha_10 | 5 | delta(div(stddev(stddev(ts_max(avg))), stddev(... | 325.906 | 10 |
alpha_9 | 6 | sub(ts_rank(delay(mul(delta(sma(open)), div(ts... | 413.386 | 18 |
alpha_8 | 1 | delta(pre_close) | 1920.03 | 2 |
alpha_7 | 3 | delta(delay(delay(open))) | 2802.3 | 4 |
alpha_5 | 3 | rank(ts_rank(div(high, pre_close))) | 3241.39 | 5 |
alpha_6 | 3 | rank(ts_rank(ts_argmin(high_limit))) | 3241.4 | 4 |
alpha_4 | 6 | mul(delay(mul(rank(ts_argmin(delta(low))), ts_... | 19240.9 | 20 |
alpha_3 | 4 | product(ts_argmin(product(ts_rank(pre_close)))) | 7.74627e+10 | 5 |
alpha_2 | 2 | product(delta(close)) | 1.31043e+15 | 3 |
alpha_1 | 5 | mul(product(sub(open, ts_sum(close))), ts_max(... | 2.53308e+50 | 14 |
def alpha_factor_graph(num):
# 打印指定num的表达式图
factor = best_programs[num-1]
print(factor)
print('fitness: {0}, depth: {1}, length: {2}'.format(factor.fitness_, factor.depth_, factor.length_))
dot_data = factor.export_graphviz()
graph = graphviz.Source(dot_data)
graph.render('images/alpha_factor_graph', format='png', cleanup=True)
return graph
# 打印因子alpha_10的结构图
graph10 = alpha_factor_graph(10)
graph10
delta(div(stddev(stddev(ts_max(avg))), stddev(ts_rank(ts_argmin(close))))) fitness: 325.9056451860095, depth: 5, length: 10
# alpha_10 delta(div(stddev(stddev(ts_max(avg))), stddev(ts_rank(ts_argmin(close)))))
def alpha_10(df):
value1 = _stddev(_stddev(_ts_max(np.array(df['avg'].tolist()))))
value2 = _stddev(_ts_rank(_ts_argmin(np.array(df['close'].tolist()))))
value = delta(value1/value2)
return value
stock_price['alpha_10'] = alpha_10(stock_price)
# 设置起止时间
start_date = '2019-01-01'
end_date = '2019-06-01'
# 设置调仓周期
periods=(1, 5, 20)
# 设置分层数量
quantiles=5
# 设置股票池
scu='000300.XSHG'
securities = jq.get_index_stocks(scu)
# 获取需要分析的数据
factor_data = pd.DataFrame(columns=securities, index=stock_price.index)
for security in securities:
stock_price = jq.get_price(security, start_date=start_date, end_date=end_date, fq='post', fields=fields)
stock_price['pct'] = stock_price['close'].pct_change(periods=1)
stock_price['alpha_10'] = alpha_10(stock_price)
factor_data[security] = stock_price['alpha_10']
# 使用获取的因子值进行单因子分析
far = ja.analyze_factor(factor=factor_data,
weight_method='avg',
industry='jq_l1',
quantiles=quantiles,
periods=periods,
max_loss=0.25)
# 生成统计图表
far.create_full_tear_sheet(
demeaned=False, group_adjust=False, by_group=False,
turnover_periods=None, avgretplot=(5, 15), std_bar=False
)
分位数统计
min | max | mean | std | count | count % | |
---|---|---|---|---|---|---|
factor_quantile | ||||||
1.0 | -209.188979 | 0.337907 | -1.892000 | 8.576601 | 4140 | 20.003865 |
2.0 | -8.521210 | 0.845769 | -0.256171 | 0.989321 | 4139 | 19.999034 |
3.0 | -3.770525 | 2.599885 | -0.023692 | 0.507251 | 4138 | 19.994202 |
4.0 | -1.505961 | 8.112976 | 0.201420 | 0.787316 | 4139 | 19.999034 |
5.0 | -0.439981 | 196.023430 | 1.565537 | 6.509158 | 4140 | 20.003865 |
------------------------- 收益分析
period_1 | period_5 | period_20 | |
---|---|---|---|
Ann. alpha | 0.429 | 0.658 | 0.082 |
beta | -0.040 | 0.000 | 0.136 |
Mean Period Wise Return Top Quantile (bps) | 29.506 | 26.815 | 23.671 |
Mean Period Wise Return Bottom Quantile (bps) | 24.146 | 22.783 | 23.058 |
Mean Period Wise Spread (bps) | 5.360 | 3.856 | 0.444 |
<matplotlib.figure.Figure at 0x28bb44dc860>
<matplotlib.figure.Figure at 0x28bb45879e8>
<matplotlib.figure.Figure at 0x28bb6875a90>
<matplotlib.figure.Figure at 0x28bb6df39e8>
------------------------- IC 分析
period_1 | period_5 | period_20 | |
---|---|---|---|
IC Mean | -0.003 | 0.025 | 0.012 |
IC Std. | 0.114 | 0.127 | 0.098 |
IR | -0.024 | 0.194 | 0.118 |
t-stat(IC) | -0.200 | 1.613 | 0.980 |
p-value(IC) | 0.842 | 0.111 | 0.331 |
IC Skew | 0.627 | 0.127 | 0.220 |
IC Kurtosis | -0.248 | -0.725 | -0.707 |
<matplotlib.figure.Figure at 0x28bb6d4be80>
<matplotlib.figure.Figure at 0x28bb709de10>
<matplotlib.figure.Figure at 0x28bb8c150b8>
------------------------- 换手率分析
period_1 | period_20 | period_5 | |
---|---|---|---|
Quantile 1 Mean Turnover | 0.408 | 0.717 | 0.765 |
Quantile 2 Mean Turnover | 0.531 | 0.783 | 0.802 |
Quantile 3 Mean Turnover | 0.484 | 0.686 | 0.608 |
Quantile 4 Mean Turnover | 0.526 | 0.775 | 0.804 |
Quantile 5 Mean Turnover | 0.360 | 0.701 | 0.760 |
period_1 | period_5 | period_20 | |
---|---|---|---|
Mean Factor Rank Autocorrelation | 0.546 | -0.212 | -0.013 |
<matplotlib.figure.Figure at 0x28bb68f29e8>
<matplotlib.figure.Figure at 0x28bb6f99f98>
-------------------------
<matplotlib.figure.Figure at 0x28bb71177b8>
本社区仅针对特定人员开放
查看需注册登录并通过风险意识测评
5秒后跳转登录页面...
移动端课程