from jqfactor import Factor, calc_factors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Universe under test: constituents of the index '000300.XSHG'.
# NOTE(review): get_index_stocks is not imported in this file; presumably it is
# injected by the JoinQuant research environment — confirm before running elsewhere.
stock = get_index_stocks('000300.XSHG')

class Hs300Alpha(Factor):
    """Excess return of each stock over the '000300.XSHG' index across the data window."""

    # Factor name; results are looked up under this key.
    name = 'hs300_alpha'
    # Length of the data window requested for each calculation.
    max_window = 10
    # Data fields this factor depends on.
    dependencies = ['close']

    def calc(self, data):
        """Return a pandas.Series of alpha values indexed by stock code.

        Alpha is the stock's return between the first and last row of the
        'close' window minus the index return over the same rows.
        """
        # Per-stock return from the first to the last row of the window.
        stock_close = data['close']
        window_start = stock_close.iloc[0, :]
        window_end = stock_close.iloc[-1, :]
        stock_return = window_end / window_start - 1

        # Benchmark (CSI 300) close prices and return over the same window.
        benchmark_close = self._get_extra_data(
            securities=['000300.XSHG'], fields=['close']
        )['close']
        index_return = benchmark_close.iat[-1, 0] / benchmark_close.iat[0, 0] - 1

        # Alpha = stock return minus benchmark return.
        return stock_return - index_return
# Compute the factor for every stock in the universe over the test period.
factors = calc_factors(stock, [Hs300Alpha()], start_date='2015-01-01', end_date='2017-12-31')

# Factor values: one row per date, one column per stock.
data = factors['hs300_alpha']

# Missing-value handling:
#   1. keep only stocks whose share of missing factor values is below 20%;
#   2. fill the remaining gaps with that stock's own mean factor value.
# The selection is driven by the actual columns of `data` (no fixed count of
# 300), and `data` itself is left untouched.
missing_ratio = data.isnull().mean()
output = data.loc[:, missing_ratio < 0.2]
output = output.fillna(output.mean())

# Outlier handling: winsorise every column at median +/- 3 * 1.4826 * MAD,
# where MAD is the median absolute deviation of that column.
# Each column is clipped against its OWN bounds; the earlier version built the
# masks from column 1 for every column and wrote through chained indexing.
column_median = output.median()
column_mad = (output - column_median).abs().median()
upper_bound = column_median + 3 * 1.4826 * column_mad
lower_bound = column_median - 3 * 1.4826 * column_mad
output = output.clip(lower=lower_bound, upper=upper_bound, axis=1)

# Standardisation: z-score every column (subtract its mean, divide by its
# standard deviation) in one vectorised step.
# ddof=0 (population standard deviation) is what numpy's std yields, which the
# bare `std` call used previously presumably resolved to — confirm if a sample
# standard deviation is wanted instead.
output = (output - output.mean()) / output.std(ddof=0)

# Build the industry dummy matrix: one row per 'sw_l1' industry, one column per
# stock; a cell is 1 when the stock belongs to that industry, 0 otherwise.
from jqdata import *
sw = get_industries(name='sw_l1').index
# One membership set per industry, in the same order as `sw`.
industry_members = [set(get_industry_stocks(code)) for code in sw]
# The number of rows follows the number of industries actually returned
# instead of a fixed 28, so no industry is dropped or appended as a NaN row.
industry = pd.DataFrame(
    [[int(code in members) for code in output.columns] for members in industry_members],
    columns=output.columns,
    index=range(len(sw)),
)

# Remove market-cap and industry effects: for every date, regress the factor
# values on standardised circulating market cap plus the industry dummies and
# keep the regression residuals as the neutralised factor.
residual_rows = []
for i, date in enumerate(output.index):
    m = get_fundamentals(query(valuation.circulating_cap, valuation.code).filter(valuation.code.in_(output.columns)), date=date)
    m.index = np.array(m['code'])
    m = m.iloc[:, 0]
    # Standardise market cap; ddof=0 matches numpy's std — TODO confirm intent.
    m = (m - m.mean()) / m.std(ddof=0)
    x = output.iloc[i, :]
    # Align market cap to the factor's stock order so every regressor shares the
    # same row index and the residuals come back in `output.columns` order.
    # Stocks without a market-cap record get the cross-sectional mean.
    conc = pd.concat([x, m.reindex(output.columns), industry.T], axis=1).fillna(m.mean())
    est = sm.OLS(conc.iloc[:, 0], conc.iloc[:, 1:]).fit()
    # Kept at module level for ad-hoc plotting of the last regression.
    y_fitted = est.fittedvalues
    residual_rows.append(est.resid)
# Assemble all residuals at once: rows are dates, columns are stocks.
newx = pd.DataFrame(residual_rows)
newx.index = output.index
# Drop the first date.
newx = newx.iloc[1:, :]
# Plot the fit (disabled): the triple-quoted string below holds plotting code
# that refers to `conc` and `y_fitted`; as a bare string expression it is never
# executed.
'''
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(conc.iloc[:,1],conc.iloc[:,0], 'o', label='data')
ax.plot(conc.iloc[:,1], y_fitted, 'r--.',label='OLS')
'''

# The next line is the same snippet as a one-line string expression (it looks
# like the notebook's echo of the literal above); it has no effect either.
"\nfig, ax = plt.subplots(figsize=(8,6))\nax.plot(conc.iloc[:,1],conc.iloc[:,0], 'o', label='data')\nax.plot(conc.iloc[:,1], y_fitted, 'r--.',label='OLS')\n"

# Match the factor values with the return series y.
output = output.iloc[:-1, :]
df = get_price(list(output.columns), start_date='2017-01-01', end_date='2017-12-31', frequency='daily', fields=['close'])
close_prices = df['close']
# Simple daily return: (P_t - P_{t-1}) / P_{t-1}.  The price change is divided
# by the PREVIOUS close; dividing by the current close understates gains and
# overstates losses.
y = close_prices / close_prices.shift(1) - 1
# The first row has no previous close.
y = y.iloc[1:, :]
# Fill remaining gaps with each stock's mean daily return.
y = y.fillna(y.mean())
# Keep only the dates for which a neutralised factor value exists.
y = y.loc[y.index.isin(newx.index)]

# Cross-sectional robust regression per date: regress the stock returns of a
# date on the neutralised factor values of the SAME date and record the
# coefficient (factor return) and its t-value.
f = [0] * len(y.index)
t = [0] * len(y.index)
for i, date in enumerate(y.index):
    day_returns = y.loc[date]
    # Look the factor row up by date and put its stocks in the same order as
    # the returns; pairing rows/columns by position silently mixes dates and
    # stocks whenever the two frames differ in range or column order.
    day_factor = newx.loc[date].reindex(day_returns.index)
    # missing='drop' skips stocks lacking either a return or a factor value.
    rlm_model = sm.RLM(day_returns, day_factor, M=sm.robust.norms.HuberT(), missing='drop').fit()
    # Single regressor, so params/tvalues each hold exactly one entry.
    f[i] = float(np.asarray(rlm_model.params)[0])
    t[i] = float(np.asarray(rlm_model.tvalues)[0])
    # To inspect one fit, plot rlm_model.fittedvalues against day_factor.

# Test statistics for the factor-return series f and its t-values t.
factor_returns = pd.Series(f)
abs_t_values = pd.Series(t).abs()

# Share of periods with a positive factor return.
positive_return_ratio = (factor_returns > 0).mean()
# Mean absolute t-value of the regression coefficient.
mean_abs_t = abs_t_values.mean()
# Share of periods whose absolute t-value is at least 2.
significant_t_ratio = (abs_t_values >= 2).mean()

# IC series: one rank correlation per stock between the factor-return series
# and that stock's return series.
# NOTE(review): IC is commonly defined per date, across stocks, between factor
# values and subsequent returns; this keeps the existing per-stock definition —
# confirm which one is intended.
factor_return_rank = factor_returns.rank().values
IC = [
    np.corrcoef(factor_return_rank, y.iloc[:, i].rank().values)[0, 1]
    for i in range(len(y.columns))
]
ic_series = pd.Series(IC)
# Mean and (population) standard deviation of the IC series.
ic_mean = np.mean(IC)
ic_std = np.std(IC)
# Share of IC values above zero.
ic_positive_ratio = (ic_series > 0).mean()
# Share of IC values whose ABSOLUTE value exceeds 0.02.
ic_abs_above_threshold_ratio = (ic_series.abs() > 0.02).mean()
# Information ratio: mean IC divided by the IC standard deviation.
ir = ic_mean / ic_std

# All statistics in one place; as the last expression of a notebook cell this
# is what gets displayed.
factor_test_summary = pd.Series({
    'positive_return_ratio': positive_return_ratio,
    'mean_abs_t': mean_abs_t,
    'significant_t_ratio': significant_t_ratio,
    'ic_mean': ic_mean,
    'ic_std': ic_std,
    'ic_positive_ratio': ic_positive_ratio,
    'ic_abs_above_threshold_ratio': ic_abs_above_threshold_ratio,
    'ir': ir,
})
factor_test_summary

-0.0032439286775737546

# Time-series bar chart of the factor returns: one bar per entry of f.
plt.bar(range(len(f)),f)

<Container object of 242 artists>

# Histogram of the factor-return distribution (values of f).
plt.hist(f)

(array([  1.,   2.,   4.,  21.,  41.,  91.,  56.,  20.,   4.,   2.]),
 array([-0.00606028, -0.00500213, -0.00394397, -0.00288582, -0.00182767,
        -0.00076951,  0.00028864,  0.00134679,  0.00240495,  0.0034631 ,
         0.00452125]),
 <a list of 10 Patch objects>)

# Bar chart of the absolute t-values of the regression factor returns.
plt.bar(range(len(t)),abs(pd.Series(t)))

<Container object of 228 artists>

# Bar chart of the IC series: one bar per entry of IC.
plt.bar(range(len(IC)),pd.Series(IC))

<Container object of 290 artists>

# Layered backtest: on every date split the stocks into five equally sized
# layers by neutralised factor rank (layer 1 = lowest values) and track the
# mean cumulative return of each layer.
newy = df['close'] / df['close'].iloc[0, :] - 1
newy = newy.iloc[1:, :]
# NOTE(review): gaps in the cumulative returns are filled with the mean DAILY
# return taken from `y`; kept as is, but this looks like a copy-paste of the
# daily-return fill — confirm the intended fill value.
newy = newy.fillna(y.mean())
# Keep only the dates for which a neutralised factor value exists.
newy = newy.loc[newy.index.isin(newx.index)]

n_layers = 5
layer_means = [[] for _ in range(n_layers)]
for date in newy.index:
    # Factor row looked up by date so returns and factor values always refer to
    # the same day (positional pairing breaks when the two frames differ in length).
    ranks = newx.loc[date].rank()
    # Layer boundaries depend on the number of ranked STOCKS, not on the number
    # of dates.
    n_ranked = ranks.notnull().sum()
    # Layer k holds the ranks in ((k-1)*n/5, k*n/5]; every stock lands in
    # exactly one layer, so lower layers cannot leak into higher ones.
    layer_of = np.ceil(ranks * n_layers / n_ranked).clip(1, n_layers)
    day_returns = newy.loc[date]
    for k in range(n_layers):
        members = layer_of.index[layer_of == k + 1]
        layer_means[k].append(day_returns.reindex(members).mean())

# Per-layer series kept under their original names.
fc1, fc2, fc3, fc4, fc5 = layer_means
# One column per layer (0..4), one row per date.
fcsum = pd.DataFrame(layer_means).T

# Plot the five layer curves.
plt.plot(fcsum)

[<matplotlib.lines.Line2D at 0x7f1b959abc50>,
 <matplotlib.lines.Line2D at 0x7f1b959abe10>,
 <matplotlib.lines.Line2D at 0x7f1b959abfd0>,
 <matplotlib.lines.Line2D at 0x7f1b959b3240>,
 <matplotlib.lines.Line2D at 0x7f1b959b3438>]

指标名称	Hs300Alpha因子
因子收益序列>0的概率	0.45
t值绝对值的均值	0.99
t值绝对值大于等于2的概率	0.1
IC值的均值	-0.003
IC值的标准差	0.068
IC大于0的比例	0.47
IC绝对值大于0.02的比例	0.38
IR值	-0.05

量化交易吧 / 源码分享 帖子：3369590 新帖：26

【笔记】多因子系列报告之一：因子测试框架 （光大）

我是游客 发表于：5 月 10 日 04:41 回复(1)

【笔记】多因子系列报告之一：因子测试框架 （光大）

单因子测试具体步骤

样本筛选

获取沪深300指数成分股在测试周期内的因子值

单因子回归模型

单因子的有效性检验

分层回溯测试

全部回复

0/140

粉丝:473

帖子数:0

粉丝:555

帖子数:3

粉丝:686

帖子数:0

量化课程

热门标签

删除回复

确认要删除这篇文章么？

举报用户

信息提示

该文章已删除

设置置顶

完成设置【置顶】！

设置置顶

已取消设置【置顶】！

设置精华

完成设置【精华】！

设置精华

已取消设置【精华】！

审核信息

该文章已审核通过

审核信息

您已设置该文章审核不通过

举报成功

您已举报成功

用户登录

移动帖子

创建私信

屏蔽提示

确认要屏蔽该用户么？

屏蔽回复

您已对该用户实现屏蔽

信息回复

已发送成功

量化交易吧 / 源码分享帖子：3369590 新帖：26

【笔记】多因子系列报告之一：因子测试框架（光大）

【笔记】多因子系列报告之一：因子测试框架（光大）