from jqfactor import Factor, calc_factors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Stock codes returned by get_index_stocks for the index '000300.XSHG' (CSI 300);
# this list is the universe the factor is computed for further down.
stock = get_index_stocks('000300.XSHG')

class Hs300Alpha(Factor):
    """Per-stock return over the data window minus the '000300.XSHG' index return."""

    # Name under which the factor values are published
    name = 'hs300_alpha'
    # Length of the data window requested for each calculation
    max_window = 10
    # Input fields this factor depends on
    dependencies = ['close']

    def calc(self, data):
        """Compute the excess return of every stock for the current window.

        Args:
            data: mapping from field name to price data; ``data['close']`` is
                read as a table with one row per day and one column per stock.

        Returns:
            pandas.Series indexed by stock code whose values are the stock's
            first-row-to-last-row return minus the index's return over the
            same window.
        """
        # Closing prices of the individual stocks
        stock_close = data['close']
        newest_close = stock_close.iloc[-1, :]
        oldest_close = stock_close.iloc[0, :]
        stock_return = newest_close / oldest_close - 1

        # Closing prices of the index (CSI 300), fetched as extra data
        extra = self._get_extra_data(securities=['000300.XSHG'], fields=['close'])
        benchmark_close = extra['close']
        benchmark_return = benchmark_close.iat[-1, 0] / benchmark_close.iat[0, 0] - 1

        # Alpha = stock return in excess of the index return
        return stock_return - benchmark_return
# Compute the factor for every code in `stock` over the sample period; the
# result is looked up by factor name below.
factors = calc_factors(stock, [Hs300Alpha()], start_date='2015-01-01', end_date='2017-12-31')

data = factors['hs300_alpha']

# Missing-value handling
# Keep only the columns whose share of missing values is below 20%.  The ratio
# is computed for every column of `data` (no hard-coded column count), so the
# step also works when the universe does not contain exactly 300 names.
missing_ratio = data.isnull().mean()
output = data.loc[:, missing_ratio < 0.2].copy()
# Fill the remaining gaps with each column's own mean.  Column means are taken
# explicitly along axis 0 so the result does not depend on what a bare `mean`
# happens to resolve to in the surrounding namespace.
output = output.fillna(output.mean(axis=0))

# Outlier handling: clip every column to median +/- 3 * 1.4826 * MAD.
# (1.4826 is presumably the usual factor that makes the MAD comparable to a
# standard deviation for normal data.)
for i in range(output.shape[1]):
    column = output.iloc[:, i]
    center = column.median()
    mad = (column - center).abs().median()
    upper = center + 3 * 1.4826 * mad
    lower = center - 3 * 1.4826 * mad
    # Clip column i using column i's own values.  The previous version built the
    # masks from column 1 for every i and assigned through chained indexing,
    # which pandas does not guarantee to write back into `output`.
    output.iloc[:, i] = column.clip(lower=lower, upper=upper)

# Standardisation: z-score every column (subtract the column mean, divide by
# the column standard deviation).
# ddof=0 keeps the population standard deviation that numpy.std uses, which is
# what the bare `std` of the original presumably resolved to; pandas' own
# default would be ddof=1.
output = (output - output.mean(axis=0)) / output.std(axis=0, ddof=0)

# Build the industry dummy matrix: one row per 'sw_l1' industry, one column per
# stock in `output`, 1 where get_industry_stocks lists the stock for that industry.
# NOTE(review): the star import presumably supplies the platform names used in
# the following sections (query, valuation, ...) — confirm before narrowing it.
from jqdata import *
sw = get_industries(name='sw_l1').index
# Size the matrix from the industry list itself instead of assuming 28 rows.
industry = pd.DataFrame(0, columns=output.columns, index=range(len(sw)))
for i, industry_code in enumerate(sw):
    industry_members = set(get_industry_stocks(industry_code))
    temp = [code for code in output.columns if code in industry_members]
    industry.loc[i, temp] = 1

# Remove market-cap and industry effects: for every date regress the factor
# values on standardised circulating market cap plus the industry dummies and
# keep the OLS residual as the new factor value.
industry_by_stock = industry.T
residual_rows = []
for i in range(len(output.index)):
    m = get_fundamentals(query(valuation.circulating_cap, valuation.code).filter(valuation.code.in_(output.columns)), date=output.index[i])
    m.index = np.array(m['code'])
    m = m.iloc[:, 0]
    # Population z-score (ddof=0, as numpy.std would compute it).
    m = (m - m.mean()) / m.std(ddof=0)
    x = output.iloc[i, :]
    # Column 0 is the factor, column 1 the market cap, the rest are the dummies.
    # Gaps left by the outer join are filled with the mean of the standardised cap.
    conc = pd.concat([x, m, industry_by_stock], axis=1).fillna(m.mean())
    est = sm.OLS(conc.iloc[:, 0], conc.iloc[:, 1:]).fit()
    # Kept because the disabled diagnostic plot further down refers to it.
    y_fitted = est.fittedvalues
    # Order residuals like output.columns so that the column order of `newx`
    # does not depend on how the concat happened to order the joined index.
    residual_rows.append(est.resid.reindex(output.columns))
# Build the frame once instead of inserting one column per date.
newx = pd.DataFrame(residual_rows, columns=output.columns)
newx.index = output.index
# Drop the first date.
newx = newx.iloc[1:, :]
# Inspect the fit with a plot.
# The plotting code below sits inside a string literal, so it is never executed.
'''
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(conc.iloc[:,1],conc.iloc[:,0], 'o', label='data')
ax.plot(conc.iloc[:,1], y_fitted, 'r--.',label='OLS')
'''

# The next line is a bare string expression containing the same text; it has no effect.
"\nfig, ax = plt.subplots(figsize=(8,6))\nax.plot(conc.iloc[:,1],conc.iloc[:,0], 'o', label='data')\nax.plot(conc.iloc[:,1], y_fitted, 'r--.',label='OLS')\n"

# Match factor values with the dependent variable y (daily stock returns).
# Drop the last row of `output`.
output = output.iloc[:-1, :]
df = get_price(list(output.columns), start_date='2017-01-01', end_date='2017-12-31', frequency='daily', fields=['close'])
close = df['close']
# Simple daily return: price change divided by the PREVIOUS close.  The earlier
# version divided by the current close, which is not a return.
y = close.diff() / close.shift(1)
# The first row has no previous close.
y = y.iloc[1:, :]
# Fill remaining gaps with each column's own mean.
y = y.fillna(y.mean(axis=0))
# Keep only the dates that also exist in the neutralised factor frame.
y = y[y.index.isin(newx.index)]

# Per-date robust regression (Huber loss) of returns on factor values.
# f collects the estimated coefficient (factor return), t its t-value.
f = [0] * len(y.index)
t = [0] * len(y.index)
for i, date in enumerate(y.index):
    returns = y.iloc[i, :]
    # Pair rows by DATE and columns by stock code.  Positional pairing matched
    # row i of `y` with row i of `newx` even though the two frames do not start
    # on the same date, and relied on both having identical column order.
    exposure = newx.loc[date].reindex(returns.index)
    rlm_model = sm.RLM(returns, exposure, M=sm.robust.norms.HuberT()).fit()
    # Single regressor, so params/tvalues hold exactly one element each.
    f[i] = float(np.asarray(rlm_model.params).ravel()[0])
    t[i] = float(np.asarray(rlm_model.tvalues).ravel()[0])
    # Optional plot of the regression result (disabled):
    # y_fit = rlm_model.fittedvalues
    # fig, ax = plt.subplots(figsize=(8,6))
    # ax.plot(newx, y, 'o', label='data')
    # ax.plot(newx, y_fit, 'r--.', label='OLS')

# Test results.  Each statistic is stored under a name instead of being a bare
# expression whose value is discarded when run as a script.
factor_returns = pd.Series(f)
t_values = pd.Series(t)
# Share of dates with a positive factor return
positive_return_ratio = (factor_returns > 0).mean()
# Mean absolute t-value of the regression coefficient
mean_abs_t = t_values.abs().mean()
# Share of dates with |t| >= 2 (the comparison now matches the stated threshold)
significant_t_ratio = (t_values.abs() >= 2).mean()
# IC series.
# NOTE(review): this correlates the ranked factor-return time series with each
# stock's ranked return series (one value per stock).  A conventional rank IC
# is a per-date cross-sectional correlation between factor values and returns —
# confirm which is intended.
factor_return_rank = factor_returns.rank()
IC = [
    np.corrcoef(factor_return_rank, y.iloc[:, i].rank())[0, 1]
    for i in range(len(y.columns))
]
ic_series = pd.Series(IC)
# Mean of the IC values
ic_mean = np.mean(IC)
# Standard deviation of the IC values (population, ddof=0)
ic_std = np.std(IC)
# Share of IC values above 0
ic_positive_ratio = (ic_series > 0).mean()
# Share of IC values whose ABSOLUTE value exceeds 0.02 (abs was missing before)
ic_abs_above_threshold_ratio = (ic_series.abs() > 0.02).mean()
# IR value
ic_ir = ic_mean / ic_std

# Value echoed by the original notebook after this cell (which statistic it
# belongs to cannot be told from here): -0.0032439286775737546

# Bar chart of the factor-return time series
plt.bar(range(len(f)), f)
# Notebook output: <Container object of 242 artists>

# Histogram of the factor returns
plt.hist(f)
# Notebook output:
# (array([  1.,   2.,   4.,  21.,  41.,  91.,  56.,  20.,   4.,   2.]),
#  array([-0.00606028, -0.00500213, -0.00394397, -0.00288582, -0.00182767,
#         -0.00076951,  0.00028864,  0.00134679,  0.00240495,  0.0034631 ,
#          0.00452125]),
#  <a list of 10 Patch objects>)

# Bar chart of the absolute t-values of the factor-return regressions
plt.bar(range(len(t)), abs(pd.Series(t)))
# Notebook output: <Container object of 228 artists>

# Bar chart of the IC series
plt.bar(range(len(IC)), pd.Series(IC))
# Notebook output: <Container object of 290 artists>

# Layered back-test: on every date sort the stocks into five equally sized
# layers by factor value and record the mean return of each layer.

# Return of every stock relative to its first close in `df`.
newy = df['close'] / df['close'].iloc[0, :] - 1
newy = newy.iloc[1:, :]
# Fill gaps with newy's own column means (the means of `y` were used before).
newy = newy.fillna(newy.mean(axis=0))
# Keep only the dates that also exist in the factor frame.
newy = newy[newy.index.isin(newx.index)]

n_layers = 5
layer_means = []
for i, date in enumerate(newy.index):
    # Pair factor values and returns by date rather than by row position.
    ranks = newx.loc[date].rank().dropna()
    # ceil(rank * n_layers / n_stocks) puts rank r into layer k exactly when
    # (k-1)/5 * n_stocks < r <= k/5 * n_stocks.  Cut-offs are based on the number
    # of STOCKS ranked on this date (the old code used the number of dates), and
    # every stock lands in exactly one layer (the old chained set differences
    # let lower layers leak back into layers 3-5).
    layer = np.ceil(ranks * n_layers / ranks.count()).astype(int)
    day_returns = newy.iloc[i, :]
    # groupby aligns `layer` to the returns by stock code.
    per_layer = day_returns.groupby(layer).mean().reindex(range(1, n_layers + 1))
    layer_means.append(per_layer.tolist())

# One row per date, columns 0..4 = layer 1..5 (same layout as before).
fcsum = pd.DataFrame(layer_means, columns=range(n_layers))
# Per-layer lists kept under their previous names.
fc1, fc2, fc3, fc4, fc5 = (fcsum[k].tolist() for k in range(n_layers))

# Plot the layer series: one line per column of fcsum
plt.plot(fcsum)
# Notebook output:
# [<matplotlib.lines.Line2D at 0x7f1b959abc50>,
#  <matplotlib.lines.Line2D at 0x7f1b959abe10>,
#  <matplotlib.lines.Line2D at 0x7f1b959abfd0>,
#  <matplotlib.lines.Line2D at 0x7f1b959b3240>,
#  <matplotlib.lines.Line2D at 0x7f1b959b3438>]

【笔记】多因子系列报告之一：因子测试框架（光大）

【笔记】多因子系列报告之一：因子测试框架（光大）

单因子测试具体步骤

样本筛选

获取沪深300指数成分股在测试周期内的因子值

单因子回归模型

单因子的有效性检验

分层回溯测试

审核消息

该文章已通过审核

全部回复

0/140

热门文章最新文章

热门标签

更多人气分析师

财经资讯

行情数据

指标名称	Hs300Alpha因子
因子收益序列>0的概率	0.45
t值绝对值的均值	0.99
t值绝对值大于等于2的概率	0.1
IC值的均值	-0.003
IC值的标准差	0.068
IC大于0的比例	0.47
IC绝对值大于0.02的比例	0.38
IR值	-0.05

【笔记】多因子系列报告之一：因子测试框架（光大）

【笔记】多因子系列报告之一：因子测试框架（光大）

单因子测试具体步骤

样本筛选

获取沪深300指数成分股在测试周期内的因子值

单因子回归模型

单因子的有效性检验

分层回溯测试

审核消息

该文章已通过审核

全部回复

0/140

热门文章最新文章

热门标签

更多人气分析师

财经资讯

行情数据

【笔记】多因子系列报告之一：因子测试框架（光大）

【笔记】多因子系列报告之一：因子测试框架（光大）