# 第二步-因子检验
import time
import datetime
import jqdata
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from jqfactor import Factor,calc_factors
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
import pickle
pkl_file = open('MyPackage.pkl', 'rb')
load_Package = pickle.load(pkl_file)
g_univ_dict,return_df,all_return_df,raw_factor_dict,all_factor_dict,all_industry_df=load_Package

univ_dict=g_univ_dict

# Step II: 因子筛选用到的函数
def ic_calculator(factor,return_df,univ_dict):
    ic_list=[]
    p_value_list=[]
    for date in sorted(list(univ_dict.keys())):   #这里是循环
        univ=univ_dict[date]
        univ=list(set(univ)&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        if len(univ)<10:
            continue
        factor_se=factor.loc[date,univ]
        return_se=return_df.loc[date,univ]
        ic,p_value=st.spearmanr(factor_se,return_se)
        ic_list.append(ic)
        p_value_list.append(p_value)
    return ic_list

def weighted_ic_calculator(factor,return_df,univ_dict,w=0.95):
    ic_list=[]
    p_value_list=[]
    for date in sorted(list(univ_dict.keys())):   #这里是循环
        univ=univ_dict[date]
        univ=list(set(univ)&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        if len(univ)<10:
            continue
        factor_se=factor.loc[date,univ]
        return_se=return_df.loc[date,univ]            
        df=pd.concat([factor_se.to_frame('factor'),return_se.to_frame('ret')],axis=1)
        ic,p_value=st.spearmanr(factor_se,return_se)
        signal=False if ic>0 else True
        df=df.sort('factor',ascending=signal)
        N=len(df)
        
        weight=w**np.arange(N)/sum(w**np.arange(N))
        df['weight']=weight
        A1=sum(df['weight']*df['factor']*df['ret'])
        A2=sum(df['weight']*df['factor'])
        A3=sum(df['weight']*df['ret'])
        B1=sum(df['weight']*df['factor']**2)
        B2=sum(df['weight']*df['ret']**2)
        weighted_ic=(A1-A2*A3)/(sqrt(B1-A2**2)*sqrt(B2-A3**2))
        ic_list.append(weighted_ic)
        
    return ic_list

def grouped_ic_calculator(factor,return_df,univ_dict,Group=20):
    ic_list=[]
    p_value_list=[]
    for date in sorted(list(univ_dict.keys())):   #这里是循环
        univ=univ_dict[date]
        univ=list(set(univ)&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        if len(univ)<10:
            continue
        factor_se=factor.loc[date,univ]
        return_se=return_df.loc[date,univ]    
        df=pd.concat([factor_se.to_frame('factor'),return_se.to_frame('ret')],axis=1)
        #ic,p_value=st.spearmanr(factor_se,return_se)
        #signal=False if ic>0 else True
        df=df.sort('factor',ascending=True)
        N=len(df)
        factor_grouped_list=[]
        ret_grouped_list=[]
        
        for i in arange(Group):            
            factor_grouped_list.append(df.ix[int(round(i/Group*N)):int(round((i+1)/Group*N-1)),'factor'].mean())
            ret_grouped_list.append(df.ix[int(round(i/Group*N)):int(round((i+1)/Group*N-1)),'ret'].mean())
        VCV=cov(np.array(ret_grouped_list),factor_grouped_list)
        grouped_ic=VCV[0,1]/sqrt(VCV[0,0]*VCV[1,1])        
        ic_list.append(grouped_ic)        
    return ic_list

starttime=time.clock()

print('\n计算IC:')
count=1
ic_list_dict={}
for key,factor in all_factor_dict.items():
    ic_list=ic_calculator(factor,return_df,univ_dict)
    ic_list_dict[key]=ic_list
    print(count,end=',')
    count=count+1
    
ic_df=pd.DataFrame(ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
ic_df.mean().abs().hist()

print('\n计算Weighted_IC:')
count=1
weighted_ic_list_dict={}
for key,factor in all_factor_dict.items():
    weighted_ic_list=weighted_ic_calculator(factor,return_df,univ_dict)
    weighted_ic_list_dict[key]=weighted_ic_list
    print(count,end=',')
    count=count+1
    
weighted_ic_df=pd.DataFrame(weighted_ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
weighted_ic_df.mean().abs().hist()


print('\n计算Grouped_IC:')
count=1
grouped_ic_list_dict={}
for key,factor in all_factor_dict.items():
    grouped_ic_list=grouped_ic_calculator(factor,return_df,univ_dict)
    grouped_ic_list_dict[key]=grouped_ic_list
    print(count,end=',')
    count=count+1
    
grouped_ic_df=pd.DataFrame(grouped_ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
grouped_ic_df.mean().abs().hist()

endtime=time.clock()
runtime=endtime-starttime
print('因子生成运行完成，用时 %.2f 秒' % runtime)

计算IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
计算Weighted_IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
计算Grouped_IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,因子生成运行完成，用时 122.53 秒

ic_df.mean().abs().hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc2b9926a58>

weighted_ic_df.mean().abs().hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc2bbf35f28>

grouped_ic_df.mean().abs().hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc2a7d86978>

# 如果你没有因子数据，那么就先运行第一步-因子生成，大约需要18分钟。
import time
import jqdata
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from jqfactor import Factor,calc_factors
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
from jqfactor import get_factor_values
from jqfactor import winsorize,winsorize_med,neutralize,standardlize
import pickle

import xlrd   # 手工输入156个因子太麻烦，所以我就在EXCEL里上传了,也可手工输入。
ExcelFile=xlrd.open_workbook('FactorTable.xlsx')
name=ExcelFile.sheet_names()
sheet=ExcelFile.sheet_by_name(name[0])
factor_quality=list(sheet.col_values(1))
factor_fundamental=list(sheet.col_values(2))[:28]
factor_mood=list(sheet.col_values(3))[:35]
factor_growth=list(sheet.col_values(4))[:8]
factor_risk=list(sheet.col_values(5))[:12]
factor_stock=list(sheet.col_values(6))[:15]

starttime=time.clock()

global g_index
global g_count
global g_factor_list
global g_univ_dict
global g_neu_factor

g_index='000300.XSHG'
g_count=500
g_factor_list=factor_quality+factor_fundamental+factor_mood+factor_growth+factor_risk+factor_stock
g_neu_factor=factor_quality+factor_fundamental+factor_growth+factor_stock

def get_trade_dates(end,count=250,interval=20):
    date_list=list(jqdata.get_trade_days(end_date=end,count=count))
    date_list=date_list[::-1]
    date_list=list(filter(lambda x:date_list.index(x)%interval==0,date_list))
    date_list=date_list[::-1]
    return date_list

def get_stock_pool(date,index='all'):                    
    df=get_all_securities(types=['stock'],date=date)
    dayBefore=jqdata.get_trade_days(end_date=date,count=60)[0]      #上市不足60天
    df=df[df['start_date']<dayBefore]                               #上市不足count天的去掉
    universe_pool=list(df.index)
    if index=='all':
        stock_pool=universe_pool
    else:
        index_pool=get_index_stocks(index,date=date)
        stock_pool=list(set(index_pool)&set(universe_pool))
    return stock_pool

def get_stock_universe(trade_date_list,index='all'):               
    univ_list=[]
    univ_dict={}
    for date in trade_date_list:
        stock_pool=get_stock_pool(date,index)
        univ_list.append(stock_pool)
        univ_dict[date]=stock_pool
    return univ_list,univ_dict

def get_return(trade_date_list,count=250):     #小概率风险：一个股票曾经是指数成分股而如今已经退市      
    date=max(trade_date_list)
    universe=get_stock_pool(date,index='all')
    price=get_price(universe,end_date=date,count=count,fields=['close'],fq='pre')['close']
    return_df=price.loc[trade_date_list].pct_change().shift(-1)
    #return_df.index=dateTransform(return_df.index)
    all_return_df=price.pct_change().shift(-1)
    return return_df,all_return_df

def get_jq_factor_by_day(date):
    factor_dict=get_factor_values(securities=g_univ_dict[date], factors=g_factor_list, start_date=date, end_date=date)
    return factor_dict

def get_raw_factor_dict1(trade_date_list):
    raw_factor_dict={}
    # preset dict
    for factor in g_factor_list:
        raw_factor_dict[factor]=pd.DataFrame()

    # concate the factors
    for date in trade_date_list:
        all_factor_by_day=get_jq_factor_by_day(date)
        for factor in g_factor_list:
            raw_factor_dict[factor]=pd.concat([raw_factor_dict[factor],all_factor_by_day[factor]])
            
    return raw_factor_dict

def get_raw_factor_dict(trade_date_list):
    pool=ThreadPool(processes=len(trade_date_list))
    frame_list=pool.map(get_jq_factor_by_day,trade_date_list)
    pool.close()
    pool.join()
    raw_factor_dict={}
    count=0
    for factor in g_factor_list:
        y=[x[factor] for x in frame_list]
        y=pd.concat(y,axis=0)
        #y.index=dateTransform(y.index)                           ************************
        raw_factor_dict[factor]=y
        count=count+1
        print(count,end=',')
    return raw_factor_dict

def get_Industry_by_day(date):                                
    industry_set = ['801010', '801020', '801030', '801040', '801050', '801080', '801110', '801120', '801130', 
                  '801140', '801150', '801160', '801170', '801180', '801200', '801210', '801230', '801710',
                  '801720', '801730', '801740', '801750', '801760', '801770', '801780', '801790', '801880','801890']
    industry_df = pd.DataFrame(index=[date],columns=g_univ_dict[date])
    for industry in industry_set:
        industry_stocks = get_industry_stocks(industry,date = date)
        industry_stocks = list(set(industry_stocks)&set(g_univ_dict[date]))
        industry_df.loc[date,industry_stocks] = industry
    return industry_df

def get_industry_df(trade_date_list):    
    all_industry_df=pd.DataFrame()
    count=1
    for date in trade_date_list:
        all_industry_df=pd.concat([all_industry_df,get_Industry_by_day(date)],axis=0)
        print(count,end=',')
        count=count+1
    return all_industry_df

def replace_nan_indu(all_industry_df,factor_df,univ_dict):
    fill_factor=pd.DataFrame()
    for date in list(univ_dict.keys()):
        univ=univ_dict[date]
        factor_by_day=factor_df.loc[date,univ].to_frame('values')
        industry_by_day=all_industry_df.loc[date,univ].dropna().to_frame('industry')  #和后面的inner去除掉了没有行业的股票
        factor_by_day=factor_by_day.merge(industry_by_day,left_index=True,right_index=True,how='inner')
        mid=factor_by_day.groupby('industry').median()
        factor_by_day=factor_by_day.merge(mid,left_on='industry',right_index=True,how='left')
        factor_by_day.loc[pd.isnull(factor_by_day['values_x']),'values_x']=factor_by_day.loc[pd.isnull(factor_by_day['values_x']),'values_y']
        fill_factor=fill_factor.append(factor_by_day['values_x'].to_frame(date).T)
    return fill_factor

def pretreat_factor(factor_df,g_univ_dict,neu):
    pretreat_factor_df=pd.DataFrame(index=list(factor_df.index),columns=list(factor_df.columns))
    for date in sorted(list(g_univ_dict.keys())):
        factor_se=factor_df.loc[date,g_univ_dict[date]].dropna()
        factor_se=winsorize_med(factor_se, scale=3, inclusive=True, inf2nan=True, axis=1)   # winsorize
        if neu:
            factor_se=neutralize(factor_se, how=['jq_l1', 'market_cap'], date=date, axis=1)     # neutralize
        factor_se=standardlize(factor_se, inf2nan=True, axis=0)                             # standardize
        pretreat_factor_df.loc[date,list(factor_se.index)]=factor_se
    return pretreat_factor_df

def get_all_factor_dict(raw_factor_dict,g_univ_dict,all_industry_df):
    all_factor_dict={}
    count=0
    for key,raw_factor_df in raw_factor_dict.items():
        #把nan用行业中位数代替，依然会有nan，比如说整个行业没有该项数据，或者该行业仅有此一只股票，且为nan。
        factor_df=replace_nan_indu(all_industry_df,raw_factor_df,g_univ_dict)
        neu=True if key in g_neu_factor else False
        factor_df=pretreat_factor(factor_df,g_univ_dict,neu)
        all_factor_dict[key]=factor_df
        count=count+1
        print(count,end=',')
    return all_factor_dict

print('开始运行...')
today=datetime.date.today()                                               
yesterday=jqdata.get_trade_days(end_date=today,count=10)[0]                   # 获取回测最后一天日期
print('获取时间序列')
trade_date_list=get_trade_dates(yesterday,g_count,20)                        # 将用于计算的时间序列
print('获取股票池')
univ_list,g_univ_dict=get_stock_universe(trade_date_list,index=g_index)      # 获取股票池
print('获取历史回报')
return_df,all_return_df=get_return(trade_date_list,count=g_count)           # 获得所有股票的历史回报  (all stocks)
print('获取因子，共计%d个，进度：' % len(g_factor_list))
raw_factor_dict=get_raw_factor_dict(trade_date_list)
print('\n获取行业数据')
all_industry_df=get_industry_df(trade_date_list)
print('\n处理数据---去极值化/中性化/标准化，共计%d个，进度：'% len(g_factor_list))
all_factor_dict=get_all_factor_dict(raw_factor_dict,g_univ_dict,all_industry_df)
print('\npickle序列化')
Package=[g_univ_dict,return_df,all_return_df,raw_factor_dict,all_factor_dict,all_industry_df]
pkl_file = open('MyPackage.pkl', 'wb')
pickle.dump(Package,pkl_file,0)
pkl_file.close()
endtime=time.clock()
runtime=endtime-starttime
print('因子生成运行完成，用时 %.2f 秒' % runtime)

开始运行...
获取时间序列
获取股票池
获取历史回报
获取因子，共计156个，进度：
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
获取行业数据
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,处理数据---去极值化/中性化/标准化，共计156个，进度：
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
pickle序列化
因子生成运行完成，用时 1078.49 秒

用 IC 评价因子效果靠谱吗？-(利用分组或加权来提高IC准确度)

审核消息

该文章已通过审核

全部回复

0/140

热门文章最新文章

热门标签

更多人气分析师

财经资讯

行情数据