# 第二步-因子检验
import time
import datetime
import jqdata
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from jqfactor import Factor,calc_factors
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
import pickle
# Load the objects written by step 1 (factor generation) from MyPackage.pkl.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a MyPackage.pkl that you produced yourself.
# The context manager closes the file even if unpickling fails.
with open('MyPackage.pkl', 'rb') as pkl_file:
    load_Package = pickle.load(pkl_file)
g_univ_dict,return_df,all_return_df,raw_factor_dict,all_factor_dict,all_industry_df=load_Package

# Alias used by the IC calculations below.
univ_dict=g_univ_dict

# Step II: 因子筛选用到的函数
def ic_calculator(factor,return_df,univ_dict):
    """Return the per-date rank IC of a factor.

    For every date key of ``univ_dict`` (visited in sorted order) the
    Spearman rank correlation between the factor row and the return row is
    computed over the stocks that are in that date's universe and have a
    non-NaN value in both frames.

    Args:
        factor: DataFrame indexed by date with one column per stock.
        return_df: DataFrame of returns with the same layout.
        univ_dict: mapping ``date -> iterable of stock codes``.

    Returns:
        List of Spearman correlations, one per date that had at least 10
        usable stocks. Dates with fewer are skipped, so the list can be
        shorter than ``univ_dict``.
    """
    rank_ics=[]
    for day in sorted(univ_dict.keys()):
        has_factor=set(factor.loc[day].dropna().index)
        has_return=set(return_df.loc[day].dropna().index)
        codes=list(set(univ_dict[day])&has_factor&has_return)
        # Too few observations for a meaningful correlation.
        if len(codes)<10:
            continue
        rho,_=st.spearmanr(factor.loc[day,codes],return_df.loc[day,codes])
        rank_ics.append(rho)
    return rank_ics

def weighted_ic_calculator(factor,return_df,univ_dict,w=0.95):
    """Return the per-date exponentially weighted IC of a factor.

    For every date key of ``univ_dict`` (visited in sorted order) the stocks
    that are in the universe and have non-NaN factor and return values are
    ranked by factor value. The stock at rank ``k`` (0-based) receives the
    weight ``w**k`` (normalised to sum to 1) and the weighted Pearson
    correlation between factor and return is computed.

    The ranking direction follows the sign of the plain Spearman IC:
    descending factor order when the IC is positive, ascending otherwise.

    Args:
        factor: DataFrame indexed by date with one column per stock.
        return_df: DataFrame of returns with the same layout.
        univ_dict: mapping ``date -> iterable of stock codes``.
        w: decay base of the rank weights; ``w=1`` gives equal weights.

    Returns:
        List of weighted correlations, one per date that had at least 10
        usable stocks. A date whose factor or return has zero weighted
        variance yields NaN/inf rather than raising.
    """
    # Local import: this file's import block does not bring numpy into scope.
    import numpy as np

    ic_list=[]
    for date in sorted(univ_dict.keys()):
        univ=univ_dict[date]
        univ=list(set(univ)&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        if len(univ)<10:
            continue
        factor_se=factor.loc[date,univ]
        return_se=return_df.loc[date,univ]
        df=pd.concat([factor_se.to_frame('factor'),return_se.to_frame('ret')],axis=1)
        ic,_=st.spearmanr(factor_se,return_se)
        # Positive IC -> sort descending so the largest factor values get the
        # largest weights; otherwise (including NaN IC) sort ascending.
        df=df.sort_values('factor',ascending=not ic>0)

        decay=w**np.arange(len(df))
        weight=decay/decay.sum()
        # Cast explicitly: the factor frames may be stored with object dtype.
        x=df['factor'].values.astype(float)
        y=df['ret'].values.astype(float)

        # Weighted moments: cov = E[xy]-E[x]E[y], var = E[x^2]-E[x]^2.
        A1=np.sum(weight*x*y)
        A2=np.sum(weight*x)
        A3=np.sum(weight*y)
        B1=np.sum(weight*x**2)
        B2=np.sum(weight*y**2)
        weighted_ic=(A1-A2*A3)/(np.sqrt(B1-A2**2)*np.sqrt(B2-A3**2))
        ic_list.append(weighted_ic)

    return ic_list

def grouped_ic_calculator(factor,return_df,univ_dict,Group=20):
    """Return the per-date grouped IC of a factor.

    For every date key of ``univ_dict`` (visited in sorted order) the usable
    stocks are sorted by factor value and cut into ``Group`` consecutive
    buckets. The Pearson correlation between the bucket means of the factor
    and the bucket means of the return is that date's grouped IC.

    Bucket ``i`` covers sorted positions ``round(i/Group*N)`` up to but not
    including ``round((i+1)/Group*N)``, so every row belongs to exactly one
    bucket. When there are fewer stocks than buckets the empty buckets are
    skipped instead of contributing NaN means.

    Args:
        factor: DataFrame indexed by date with one column per stock.
        return_df: DataFrame of returns with the same layout.
        univ_dict: mapping ``date -> iterable of stock codes``.
        Group: number of buckets.

    Returns:
        List of grouped correlations, one per date that had at least 10
        usable stocks.
    """
    # Local import: this file's import block does not bring numpy into scope.
    import numpy as np

    ic_list=[]
    for date in sorted(univ_dict.keys()):
        univ=univ_dict[date]
        univ=list(set(univ)&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        if len(univ)<10:
            continue
        factor_se=factor.loc[date,univ]
        return_se=return_df.loc[date,univ]
        df=pd.concat([factor_se.to_frame('factor'),return_se.to_frame('ret')],axis=1)
        # Cast explicitly: the factor frames may be stored with object dtype.
        df=df.astype(float).sort_values('factor',ascending=True)
        N=len(df)
        factor_grouped_list=[]
        ret_grouped_list=[]

        for i in range(Group):
            start=int(round(i/Group*N))
            stop=int(round((i+1)/Group*N))
            if stop<=start:
                # More buckets than rows: nothing falls into this bucket.
                continue
            bucket=df.iloc[start:stop]
            factor_grouped_list.append(bucket['factor'].mean())
            ret_grouped_list.append(bucket['ret'].mean())
        VCV=np.cov(np.array(ret_grouped_list),np.array(factor_grouped_list))
        grouped_ic=VCV[0,1]/np.sqrt(VCV[0,0]*VCV[1,1])
        ic_list.append(grouped_ic)
    return ic_list

# Compute the plain, weighted and grouped IC of every factor and plot the
# distribution of |mean IC| for each variant.
# time.clock() was removed in Python 3.8; perf_counter() is its replacement
# for measuring elapsed time.
starttime=time.perf_counter()

# NOTE(review): each IC table below is indexed by all universe dates except
# the last one. Presumably the last date is the one the calculators skip
# (its forward return comes from shift(-1) and is NaN); if any other date is
# skipped for having fewer than 10 usable stocks the list lengths no longer
# match the index and DataFrame construction fails — confirm.

print('\n计算IC:')
count=1
ic_list_dict={}
for key,factor in all_factor_dict.items():
    ic_list=ic_calculator(factor,return_df,univ_dict)
    ic_list_dict[key]=ic_list
    print(count,end=',')   # progress counter, one tick per factor
    count=count+1

ic_df=pd.DataFrame(ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
ic_df.mean().abs().hist()

print('\n计算Weighted_IC:')
count=1
weighted_ic_list_dict={}
for key,factor in all_factor_dict.items():
    weighted_ic_list=weighted_ic_calculator(factor,return_df,univ_dict)
    weighted_ic_list_dict[key]=weighted_ic_list
    print(count,end=',')
    count=count+1

weighted_ic_df=pd.DataFrame(weighted_ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
weighted_ic_df.mean().abs().hist()


print('\n计算Grouped_IC:')
count=1
grouped_ic_list_dict={}
for key,factor in all_factor_dict.items():
    grouped_ic_list=grouped_ic_calculator(factor,return_df,univ_dict)
    grouped_ic_list_dict[key]=grouped_ic_list
    print(count,end=',')
    count=count+1

grouped_ic_df=pd.DataFrame(grouped_ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
grouped_ic_df.mean().abs().hist()

endtime=time.perf_counter()
runtime=endtime-starttime
print('因子生成运行完成，用时 %.2f 秒' % runtime)

计算IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
计算Weighted_IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
计算Grouped_IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,因子生成运行完成，用时 122.53 秒

# Histogram of the absolute value of each column's mean in ic_df (one column per factor).
ic_df.mean().abs().hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc2b9926a58>

# Histogram of the absolute value of each column's mean in weighted_ic_df (one column per factor).
weighted_ic_df.mean().abs().hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc2bbf35f28>

# Histogram of the absolute value of each column's mean in grouped_ic_df (one column per factor).
grouped_ic_df.mean().abs().hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fc2a7d86978>

# 如果你没有因子数据，那么就先运行第一步-因子生成，大约需要18分钟。
import time
import jqdata
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from jqfactor import Factor,calc_factors
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
from jqfactor import get_factor_values
from jqfactor import winsorize,winsorize_med,neutralize,standardlize
import pickle

import xlrd   # Typing in all 156 factors by hand is tedious, so they are read from an uploaded Excel sheet; they could also be entered manually.
# Read the factor names from the first sheet of FactorTable.xlsx.
ExcelFile=xlrd.open_workbook('FactorTable.xlsx')
name=ExcelFile.sheet_names()
sheet=ExcelFile.sheet_by_name(name[0])
# One sheet column per factor category (columns 1-6).
# NOTE(review): the slices keep only the first N cells of each column,
# presumably to drop trailing empty cells; column 1 is taken whole,
# presumably because it is the longest — confirm against the workbook.
factor_quality=list(sheet.col_values(1))
factor_fundamental=list(sheet.col_values(2))[:28]
factor_mood=list(sheet.col_values(3))[:35]
factor_growth=list(sheet.col_values(4))[:8]
factor_risk=list(sheet.col_values(5))[:12]
factor_stock=list(sheet.col_values(6))[:15]

# Start of the run timer; the end reading is taken with time.clock() at the
# end of this script.
# NOTE(review): time.clock() was removed in Python 3.8 — the start and end
# calls have to be replaced together (e.g. with time.perf_counter()).
starttime=time.clock()

# NOTE(review): `global` statements at module level have no effect; they only
# flag these names as configuration shared with the functions below.
global g_index
global g_count
global g_factor_list
global g_univ_dict
global g_neu_factor

g_index='000300.XSHG'   # index code passed to get_stock_universe
g_count=500             # number of trading days passed to get_trade_dates / get_return
# All factor names to fetch.
g_factor_list=factor_quality+factor_fundamental+factor_mood+factor_growth+factor_risk+factor_stock
# Factors that get neutralized in pretreat_factor (mood and risk factors are left out).
g_neu_factor=factor_quality+factor_fundamental+factor_growth+factor_stock

def get_trade_dates(end,count=250,interval=20):
    """Return every ``interval``-th trading day, counted back from the last one.

    The ``count`` trading days returned by ``jqdata.get_trade_days`` for
    ``end_date=end`` are sampled starting from the LAST element and stepping
    backwards, so the final day of the window is always included. The result
    is in the same order as the original day list.

    Args:
        end: end date passed to ``jqdata.get_trade_days``.
        count: number of trading days to request.
        interval: sampling step in trading days.

    Returns:
        List of sampled trading days.
    """
    date_list=list(jqdata.get_trade_days(end_date=end,count=count))
    # Reverse, take every interval-th element, reverse back. This selects the
    # same days as filtering on ``list.index(x) % interval == 0`` (trading
    # days are unique) without the quadratic index lookups.
    return date_list[::-1][::interval][::-1]

def get_stock_pool(date,index='all'):
    """Return the stock codes eligible on ``date``.

    Stocks whose ``start_date`` is not earlier than the first of the 60
    trading days ending at ``date`` are dropped (recently listed stocks).

    Args:
        date: reference date.
        index: ``'all'`` for every eligible stock, otherwise an index code;
            the result is then restricted to that index's constituents on
            ``date``.

    Returns:
        List of stock codes.
    """
    listed=get_all_securities(types=['stock'],date=date)
    # First of the 60 trading days ending at `date`: the listing cutoff.
    cutoff=jqdata.get_trade_days(end_date=date,count=60)[0]
    seasoned=list(listed[listed['start_date']<cutoff].index)
    if index=='all':
        return seasoned
    constituents=get_index_stocks(index,date=date)
    return list(set(constituents)&set(seasoned))

def get_stock_universe(trade_date_list,index='all'):
    """Build the stock pool for every date in ``trade_date_list``.

    Args:
        trade_date_list: dates to build a pool for.
        index: forwarded to ``get_stock_pool``.

    Returns:
        Tuple ``(univ_list, univ_dict)``: the pools in date order, and the
        same pools keyed by date.
    """
    days=list(trade_date_list)
    pools=[get_stock_pool(day,index) for day in days]
    return pools,dict(zip(days,pools))

def get_return(trade_date_list,count=250):
    """Return forward returns on the sampled dates and on every trading day.

    Prices are fetched for the stock pool of the latest date in
    ``trade_date_list``. Small risk: a stock that used to be an index
    constituent but has since been delisted is not part of that pool.

    Args:
        trade_date_list: sampled trading dates.
        count: number of daily closes to fetch, ending at the latest date.

    Returns:
        Tuple ``(return_df, all_return_df)``. Each row holds the percentage
        change from that row to the next one (``pct_change().shift(-1)``):
        ``return_df`` between consecutive sampled dates, ``all_return_df``
        between consecutive rows of the full price frame. The last row of
        each is NaN.
    """
    last_day=max(trade_date_list)
    codes=get_stock_pool(last_day,index='all')
    close=get_price(codes,end_date=last_day,count=count,fields=['close'],fq='pre')['close']
    sampled_forward=close.loc[trade_date_list].pct_change().shift(-1)
    daily_forward=close.pct_change().shift(-1)
    return sampled_forward,daily_forward

def get_jq_factor_by_day(date):
    """Fetch every factor in ``g_factor_list`` for the universe of ``date``.

    Returns whatever ``get_factor_values`` returns for the single-day range
    ``date``..``date``; callers index the result by factor name.
    """
    return get_factor_values(
        securities=g_univ_dict[date],
        factors=g_factor_list,
        start_date=date,
        end_date=date,
    )

def get_raw_factor_dict1(trade_date_list):
    """Sequentially fetch the raw factor values for every date.

    Single-threaded counterpart of ``get_raw_factor_dict``.

    Args:
        trade_date_list: dates to fetch.

    Returns:
        Dict ``factor name -> DataFrame`` with the per-date frames stacked in
        the order of ``trade_date_list``. With an empty date list every value
        is an empty DataFrame.
    """
    # Collect the per-date frames first and concatenate once per factor;
    # growing a DataFrame with concat inside the loop copies it every time.
    frames_by_factor={factor:[] for factor in g_factor_list}
    for date in trade_date_list:
        all_factor_by_day=get_jq_factor_by_day(date)
        for factor in g_factor_list:
            frames_by_factor[factor].append(all_factor_by_day[factor])

    raw_factor_dict={}
    for factor,frames in frames_by_factor.items():
        # pd.concat raises on an empty list, so keep the empty-frame result.
        raw_factor_dict[factor]=pd.concat(frames) if frames else pd.DataFrame()
    return raw_factor_dict

def get_raw_factor_dict(trade_date_list):
    """Fetch the raw factor values for every date using a thread pool.

    One worker thread per date calls ``get_jq_factor_by_day``; the per-date
    frames of each factor are then stacked in the order of
    ``trade_date_list``. A running counter is printed as each factor is
    assembled.

    Args:
        trade_date_list: dates to fetch.

    Returns:
        Dict ``factor name -> DataFrame``.
    """
    pool=ThreadPool(processes=len(trade_date_list))
    try:
        # map() returns the results in the order of trade_date_list.
        frame_list=pool.map(get_jq_factor_by_day,trade_date_list)
    finally:
        # Always release the worker threads, even if a fetch raised.
        pool.close()
        pool.join()

    raw_factor_dict={}
    for count,factor in enumerate(g_factor_list,1):
        raw_factor_dict[factor]=pd.concat([x[factor] for x in frame_list],axis=0)
        print(count,end=',')
    return raw_factor_dict

def get_Industry_by_day(date):
    """Return a one-row frame mapping each universe stock to an industry code.

    The row is labelled ``date`` and has one column per stock in
    ``g_univ_dict[date]``. Stocks found in none of the listed industries stay
    NaN; a stock found in several keeps the last matching code.
    """
    # NOTE(review): presumably Shenwan level-1 industry codes — confirm.
    industry_set = ['801010', '801020', '801030', '801040', '801050', '801080', '801110', '801120', '801130',
                    '801140', '801150', '801160', '801170', '801180', '801200', '801210', '801230', '801710',
                    '801720', '801730', '801740', '801750', '801760', '801770', '801780', '801790', '801880','801890']
    universe=g_univ_dict[date]
    universe_set=set(universe)
    industry_df=pd.DataFrame(index=[date],columns=universe)
    for code in industry_set:
        members=set(get_industry_stocks(code,date = date))&universe_set
        industry_df.loc[date,list(members)]=code
    return industry_df

def get_industry_df(trade_date_list):
    """Stack the per-date industry rows into one frame.

    Calls ``get_Industry_by_day`` for each date in order and prints a running
    counter after each one.

    Returns:
        DataFrame with one row per date; empty if ``trade_date_list`` is empty.
    """
    stacked=pd.DataFrame()
    for progress,day in enumerate(trade_date_list,1):
        stacked=pd.concat([stacked,get_Industry_by_day(day)],axis=0)
        print(progress,end=',')
    return stacked

def replace_nan_indu(all_industry_df,factor_df,univ_dict):
    """Fill missing factor values with the median of the stock's industry.

    For each date in ``univ_dict`` the factor values of that date's universe
    are joined with the industry labels. Stocks without an industry label are
    dropped by the inner join. Every NaN factor value is then replaced by the
    median of its industry on that date; it stays NaN when the whole industry
    has no value.

    Args:
        all_industry_df: DataFrame indexed by date, one column per stock,
            holding industry labels (NaN where unknown).
        factor_df: DataFrame indexed by date, one column per stock.
        univ_dict: mapping ``date -> list of stock codes``.

    Returns:
        DataFrame with one row per date of ``univ_dict`` (in the dict's
        iteration order) and one column per stock that had an industry label
        on at least one date. Empty if ``univ_dict`` is empty.
    """
    # Collect one row per date and concatenate once: DataFrame.append copied
    # the frame on every call and no longer exists in pandas 2.x.
    rows=[]
    for date in list(univ_dict.keys()):
        univ=univ_dict[date]
        factor_by_day=factor_df.loc[date,univ].to_frame('values')
        # dropna plus the inner join below removes stocks without an industry.
        industry_by_day=all_industry_df.loc[date,univ].dropna().to_frame('industry')
        factor_by_day=factor_by_day.merge(industry_by_day,left_index=True,right_index=True,how='inner')
        mid=factor_by_day.groupby('industry').median()
        # After this merge 'values_x' is the stock's own value and 'values_y'
        # is the median of its industry.
        factor_by_day=factor_by_day.merge(mid,left_on='industry',right_index=True,how='left')
        missing=pd.isnull(factor_by_day['values_x'])
        factor_by_day.loc[missing,'values_x']=factor_by_day.loc[missing,'values_y']
        rows.append(factor_by_day['values_x'].to_frame(date).T)
    if not rows:
        return pd.DataFrame()
    return pd.concat(rows)

def pretreat_factor(factor_df,g_univ_dict,neu):
    """Winsorize, optionally neutralize, and standardize a factor per date.

    Args:
        factor_df: DataFrame indexed by date, one column per stock.
        g_univ_dict: mapping ``date -> list of stock codes``.
        neu: when truthy, the winsorized values are neutralized against
            ``'jq_l1'`` and ``'market_cap'`` before standardizing.

    Returns:
        DataFrame with the same index and columns as ``factor_df``; cells
        that were NaN or outside the date's universe stay NaN.
    """
    treated=pd.DataFrame(index=list(factor_df.index),columns=list(factor_df.columns))
    for day in sorted(g_univ_dict.keys()):
        series=factor_df.loc[day,g_univ_dict[day]].dropna()
        # Step 1: winsorize.
        series=winsorize_med(series, scale=3, inclusive=True, inf2nan=True, axis=1)
        # Step 2 (optional): neutralize.
        if neu:
            series=neutralize(series, how=['jq_l1', 'market_cap'], date=day, axis=1)
        # Step 3: standardize.
        series=standardlize(series, inf2nan=True, axis=0)
        treated.loc[day,list(series.index)]=series
    return treated

def get_all_factor_dict(raw_factor_dict,g_univ_dict,all_industry_df):
    """Clean every raw factor and return the processed frames by name.

    Each factor is first passed through ``replace_nan_indu`` and then through
    ``pretreat_factor``; neutralization is applied only to factors listed in
    ``g_neu_factor``. A running counter is printed after each factor.
    """
    processed={}
    for progress,(name,raw_df) in enumerate(raw_factor_dict.items(),1):
        # Replace NaN with the industry median. NaN can remain, e.g. when the
        # whole industry lacks the value or the industry's only stock is NaN.
        filled=replace_nan_indu(all_industry_df,raw_df,g_univ_dict)
        needs_neutralizing=name in g_neu_factor
        processed[name]=pretreat_factor(filled,g_univ_dict,needs_neutralizing)
        print(progress,end=',')
    return processed

# Step 1 driver: build the universe, returns, raw and processed factors and
# industry labels, then pickle them for the factor-evaluation step.
print('开始运行...')
today=datetime.date.today()
# Last date of the sample. Despite the name this is element [0] of the 10
# trading days ending today, not the previous trading day.
yesterday=jqdata.get_trade_days(end_date=today,count=10)[0]
print('获取时间序列')
trade_date_list=get_trade_dates(yesterday,g_count,20)                        # sampled dates used in all calculations
print('获取股票池')
univ_list,g_univ_dict=get_stock_universe(trade_date_list,index=g_index)      # stock pool per date
print('获取历史回报')
return_df,all_return_df=get_return(trade_date_list,count=g_count)           # historical returns (all stocks)
print('获取因子，共计%d个，进度：' % len(g_factor_list))
raw_factor_dict=get_raw_factor_dict(trade_date_list)
print('\n获取行业数据')
all_industry_df=get_industry_df(trade_date_list)
print('\n处理数据---去极值化/中性化/标准化，共计%d个，进度：'% len(g_factor_list))
all_factor_dict=get_all_factor_dict(raw_factor_dict,g_univ_dict,all_industry_df)
print('\npickle序列化')
Package=[g_univ_dict,return_df,all_return_df,raw_factor_dict,all_factor_dict,all_industry_df]
# The context manager closes the file even if pickling fails.
with open('MyPackage.pkl', 'wb') as pkl_file:
    pickle.dump(Package,pkl_file,0)
# NOTE(review): time.clock() pairs with the starttime reading taken earlier in
# this script; it was removed in Python 3.8, so both calls must be replaced
# together.
endtime=time.clock()
runtime=endtime-starttime
print('因子生成运行完成，用时 %.2f 秒' % runtime)

开始运行...
获取时间序列
获取股票池
获取历史回报
获取因子，共计156个，进度：
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
获取行业数据
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,处理数据---去极值化/中性化/标准化，共计156个，进度：
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
pickle序列化
因子生成运行完成，用时 1078.49 秒

量化交易吧 / 量化平台 帖子：3369513 新帖：3

用 IC 评价因子效果靠谱吗？-(利用分组或加权来提高IC准确度)

美联储主席发表于：9 月 7 日 20：00回复(1)

全部回复

0/140

粉丝:914

帖子数:0

粉丝:734

帖子数:0

粉丝:555

帖子数:0

量化课程

热门标签

删除回复

确认要删除这篇文章么？

举报用户

信息提示

该文章已删除

设置置顶

完成设置【置顶】！

设置置顶

已取消设置【置顶】！

设置精华

完成设置【精华】！

设置精华

已取消设置【精华】！

审核信息

该文章已审核通过

审核信息

您已设置该文章审核不通过

举报成功

您已举报成功

用户登录

移动帖子

创建私信

屏蔽提示

确认要屏蔽该用户么？

屏蔽回复

您已对该用户实现屏蔽

信息回复

已发送成功

量化交易吧 / 量化平台帖子：3369513 新帖：3