# Step 2 - factor testing (IC analysis of the factors produced by step 1)
import datetime
import pickle
import time
from multiprocessing.dummy import Pool as ThreadPool

import jqdata
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
from jqfactor import Factor,calc_factors
# Load the data package written by the factor-generation step.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'MyPackage.pkl' when it was produced by a trusted run of step 1.
with open('MyPackage.pkl', 'rb') as pkl_file:
    # The context manager closes the handle even if unpickling fails.
    load_Package = pickle.load(pkl_file)
# The package is a 6-item sequence; the order must match the list that the
# generation step dumps.
(
    g_univ_dict,
    return_df,
    all_return_df,
    raw_factor_dict,
    all_factor_dict,
    all_industry_df,
) = load_Package
univ_dict = g_univ_dict
# Step II: helper functions used for factor screening
def ic_calculator(factor, return_df, univ_dict):
    """Compute the cross-sectional Spearman rank IC of a factor for each date.

    Args:
        factor: DataFrame with dates as rows and securities as columns.
        return_df: DataFrame of returns with the same row/column layout.
        univ_dict: Mapping of date -> iterable of securities to consider.

    Returns:
        A list with one Spearman correlation per date, in ascending date
        order. Dates with fewer than 10 securities that have both a factor
        value and a return are skipped and contribute no entry, so the list
        can be shorter than ``univ_dict``.
    """
    ic_list = []
    for date in sorted(univ_dict):
        factor_row = factor.loc[date].dropna()
        return_row = return_df.loc[date].dropna()
        # Keep only securities in the universe that have both observations.
        univ = list(
            set(univ_dict[date]) & set(factor_row.index) & set(return_row.index)
        )
        if len(univ) < 10:
            continue
        # The p-value returned by spearmanr is not used by any caller.
        ic, _ = st.spearmanr(factor_row.loc[univ], return_row.loc[univ])
        ic_list.append(ic)
    return ic_list
def weighted_ic_calculator(factor, return_df, univ_dict, w=0.95):
    """Compute a rank-decay weighted Pearson IC of a factor for each date.

    For every date the securities are ordered by factor value, in the
    direction given by the sign of the plain Spearman IC (descending when the
    IC is positive, ascending otherwise). The security at position ``k`` of
    that ordering receives weight ``w**k`` (normalised to sum to one), and the
    weighted Pearson correlation between factor and return is recorded.

    Args:
        factor: DataFrame with dates as rows and securities as columns.
        return_df: DataFrame of returns with the same row/column layout.
        univ_dict: Mapping of date -> iterable of securities to consider.
        w: Geometric decay base applied along the ordering.

    Returns:
        A list with one weighted IC per date, in ascending date order. Dates
        with fewer than 10 usable securities are skipped. ``nan`` is stored
        when either weighted variance is not positive.
    """
    ic_list = []
    for date in sorted(univ_dict):
        factor_row = factor.loc[date].dropna()
        return_row = return_df.loc[date].dropna()
        univ = list(
            set(univ_dict[date]) & set(factor_row.index) & set(return_row.index)
        )
        if len(univ) < 10:
            continue
        # Cast to float so the arithmetic below also works when the input
        # frames carry object dtype.
        df = pd.DataFrame({
            'factor': factor_row.loc[univ].astype(float),
            'ret': return_row.loc[univ].astype(float),
        })
        ic, _ = st.spearmanr(df['factor'], df['ret'])
        # Positive IC -> descending sort, so the largest factor values get the
        # largest weights; otherwise (including a nan IC) sort ascending.
        df = df.sort_values('factor', ascending=not ic > 0)
        decay = w ** np.arange(len(df))
        weight = decay / decay.sum()
        x = df['factor'].values
        y = df['ret'].values
        # Weighted moments in centred form; algebraically equal to
        # E[xy] - E[x]E[y] etc. because the weights sum to one.
        dx = x - np.dot(weight, x)
        dy = y - np.dot(weight, y)
        cov_xy = np.dot(weight, dx * dy)
        var_x = np.dot(weight, dx * dx)
        var_y = np.dot(weight, dy * dy)
        if var_x > 0 and var_y > 0:
            weighted_ic = cov_xy / (np.sqrt(var_x) * np.sqrt(var_y))
        else:
            # Degenerate cross-section: correlation is undefined. Keep an
            # entry so the list stays aligned with the processed dates.
            weighted_ic = np.nan
        ic_list.append(weighted_ic)
    return ic_list
def grouped_ic_calculator(factor, return_df, univ_dict, Group=20):
    """Compute the IC between group-averaged factor values and returns.

    For every date the securities are sorted by factor value and cut into
    ``Group`` contiguous buckets of (nearly) equal size. The Pearson
    correlation between the per-bucket mean factor and the per-bucket mean
    return is recorded.

    Args:
        factor: DataFrame with dates as rows and securities as columns.
        return_df: DataFrame of returns with the same row/column layout.
        univ_dict: Mapping of date -> iterable of securities to consider.
        Group: Number of buckets; must be at least 1.

    Returns:
        A list with one grouped IC per date, in ascending date order. Dates
        with fewer than 10 usable securities are skipped. ``nan`` is stored
        when fewer than two buckets are populated or a bucket-mean variance
        is not positive.

    Raises:
        ValueError: If ``Group`` is smaller than 1.
    """
    if Group < 1:
        raise ValueError('Group must be at least 1')
    ic_list = []
    for date in sorted(univ_dict):
        factor_row = factor.loc[date].dropna()
        return_row = return_df.loc[date].dropna()
        univ = list(
            set(univ_dict[date]) & set(factor_row.index) & set(return_row.index)
        )
        if len(univ) < 10:
            continue
        df = pd.DataFrame({
            'factor': factor_row.loc[univ].astype(float),
            'ret': return_row.loc[univ].astype(float),
        }).sort_values('factor', ascending=True)
        n_rows = len(df)
        # Bucket boundaries as positions; bucket g covers [edges[g], edges[g+1]),
        # so every row belongs to exactly one bucket.
        edges = np.round(np.arange(Group + 1) / Group * n_rows).astype(int)
        factor_grouped_list = []
        ret_grouped_list = []
        for lo, hi in zip(edges[:-1], edges[1:]):
            if hi <= lo:
                # More buckets than rows: skip the empty ones instead of
                # feeding nan means into the covariance.
                continue
            bucket = df.iloc[lo:hi]
            factor_grouped_list.append(bucket['factor'].mean())
            ret_grouped_list.append(bucket['ret'].mean())
        if len(factor_grouped_list) < 2:
            ic_list.append(np.nan)
            continue
        vcv = np.cov(np.array(ret_grouped_list), np.array(factor_grouped_list))
        if vcv[0, 0] > 0 and vcv[1, 1] > 0:
            grouped_ic = vcv[0, 1] / np.sqrt(vcv[0, 0] * vcv[1, 1])
        else:
            grouped_ic = np.nan
        ic_list.append(grouped_ic)
    return ic_list
def _collect_ic(header, calculator):
    """Run one IC calculator over every factor, printing a progress counter.

    Args:
        header: Line printed before the progress counter starts.
        calculator: Callable taking ``(factor, return_df, univ_dict)`` and
            returning a list of IC values.

    Returns:
        Dict mapping factor name -> list of IC values.
    """
    print(header)
    result = {}
    for count, (key, factor) in enumerate(all_factor_dict.items(), start=1):
        result[key] = calculator(factor, return_df, univ_dict)
        print(count, end=',')
    return result


# time.clock() was removed in Python 3.8; perf_counter() measures elapsed time.
starttime = time.perf_counter()

# NOTE(review): the index drops only the last date, i.e. it assumes the last
# date is the only one the calculators skip. If any other date has fewer than
# 10 usable securities the list lengths will not match this index.
ic_dates = sorted(univ_dict)[:-1]

ic_list_dict = _collect_ic('\n计算IC:', ic_calculator)
ic_df = pd.DataFrame(ic_list_dict, index=ic_dates)
ic_df.mean().abs().hist()

weighted_ic_list_dict = _collect_ic('\n计算Weighted_IC:', weighted_ic_calculator)
weighted_ic_df = pd.DataFrame(weighted_ic_list_dict, index=ic_dates)
weighted_ic_df.mean().abs().hist()

grouped_ic_list_dict = _collect_ic('\n计算Grouped_IC:', grouped_ic_calculator)
grouped_ic_df = pd.DataFrame(grouped_ic_list_dict, index=ic_dates)
grouped_ic_df.mean().abs().hist()

endtime = time.perf_counter()
runtime = endtime - starttime
# This script tests factors; the original message said "factor generation".
print('因子检验运行完成,用时 %.2f 秒' % runtime)

# Histograms of |mean IC| across factors, one per IC flavour.
ic_df.mean().abs().hist()
weighted_ic_df.mean().abs().hist()
grouped_ic_df.mean().abs().hist()
# If you do not have the factor data yet, run step 1 (factor generation, below) first; it takes about 18 minutes.
import time
import jqdata
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from jqfactor import Factor,calc_factors
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
from jqfactor import get_factor_values
from jqfactor import winsorize,winsorize_med,neutralize,standardlize
import pickle
import xlrd # 手工输入156个因子太麻烦,所以我就在EXCEL里上传了,也可手工输入。
# Factor names are read from the first sheet of 'FactorTable.xlsx', one
# category per column (columns 1-6).
# NOTE(review): xlrd 2.0+ no longer reads .xlsx files - confirm the installed
# xlrd version supports this workbook.
ExcelFile = xlrd.open_workbook('FactorTable.xlsx')
name = ExcelFile.sheet_names()
sheet = ExcelFile.sheet_by_name(name[0])
# Columns other than the first are truncated to a fixed number of rows;
# presumably the shorter columns are padded with empty cells - verify against
# the workbook.
factor_quality = list(sheet.col_values(1))
factor_fundamental = list(sheet.col_values(2))[:28]
factor_mood = list(sheet.col_values(3))[:35]
factor_growth = list(sheet.col_values(4))[:8]
factor_risk = list(sheet.col_values(5))[:12]
factor_stock = list(sheet.col_values(6))[:15]

# time.clock() was removed in Python 3.8. Use it while it still exists so the
# start and end readings of this script come from the same clock, and fall
# back to perf_counter() otherwise.
_timer = getattr(time, 'clock', None) or time.perf_counter
starttime = _timer()

# Module-level settings read by the functions below. (The `global` statements
# that used to precede them had no effect at module scope.)
g_index = '000300.XSHG'
g_count = 500
g_factor_list = (
    factor_quality + factor_fundamental + factor_mood
    + factor_growth + factor_risk + factor_stock
)
# Factors that get neutralised during pre-treatment (mood and risk excluded).
g_neu_factor = factor_quality + factor_fundamental + factor_growth + factor_stock
def get_trade_dates(end, count=250, interval=20):
    """Sample every ``interval``-th trading day, counting back from ``end``.

    Args:
        end: End date passed to ``jqdata.get_trade_days``.
        count: Number of trading days requested from ``jqdata.get_trade_days``.
        interval: Sampling step, in trading days.

    Returns:
        A list of dates in the same order as returned by
        ``jqdata.get_trade_days``; the last day of that sequence is always
        included.
    """
    date_list = list(jqdata.get_trade_days(end_date=end, count=count))
    # Reverse so the stride is anchored at the last day, take every
    # interval-th entry, then restore the original order. This replaces an
    # O(n^2) list.index() lookup per element.
    return date_list[::-1][::interval][::-1]
def get_stock_pool(date, index='all'):
    """Return the stocks tradable on ``date``, optionally limited to an index.

    Stocks whose ``start_date`` is not earlier than the first of the 60
    trading days ending at ``date`` are excluded.

    Args:
        date: Date passed to the data API calls.
        index: ``'all'`` for every remaining stock, otherwise an index code
            whose constituents (via ``get_index_stocks``) are intersected
            with the remaining stocks.

    Returns:
        A list of security codes. When an index is given the order is that of
        a set intersection and therefore unspecified.
    """
    securities = get_all_securities(types=['stock'], date=date)
    # First day of the 60-trading-day window ending at `date`.
    listing_cutoff = jqdata.get_trade_days(end_date=date, count=60)[0]
    seasoned = securities[securities['start_date'] < listing_cutoff]
    universe_pool = list(seasoned.index)
    if index != 'all':
        constituents = get_index_stocks(index, date=date)
        return list(set(constituents) & set(universe_pool))
    return universe_pool
def get_stock_universe(trade_date_list, index='all'):
    """Build the stock pool for every date in ``trade_date_list``.

    Args:
        trade_date_list: Iterable of dates.
        index: Forwarded to ``get_stock_pool``.

    Returns:
        A tuple ``(univ_list, univ_dict)``: the pools in input order, and the
        same pool objects keyed by date.
    """
    univ_list = [get_stock_pool(day, index) for day in trade_date_list]
    # zip keeps the pairing; the dict shares the list objects with univ_list.
    univ_dict = dict(zip(trade_date_list, univ_list))
    return univ_list, univ_dict
def get_return(trade_date_list, count=250):
    """Compute next-period returns from pre-adjusted close prices.

    The stock list is the full pool as of the latest date in
    ``trade_date_list``. Known small risk (from the original author): a stock
    that used to be an index constituent but has since been delisted.

    Args:
        trade_date_list: Dates at which period returns are sampled.
        count: Number of price rows requested from ``get_price``.

    Returns:
        A tuple ``(return_df, all_return_df)``. ``return_df`` holds the change
        between consecutive dates of ``trade_date_list``; ``all_return_df``
        holds the change between consecutive price rows. Both are shifted so
        that each row carries the return over the *following* step, which
        leaves the last row empty.
    """
    latest = max(trade_date_list)
    securities = get_stock_pool(latest, index='all')
    prices = get_price(
        securities, end_date=latest, count=count, fields=['close'], fq='pre'
    )
    close = prices['close']
    sampled_close = close.loc[trade_date_list]
    return_df = sampled_close.pct_change().shift(-1)
    all_return_df = close.pct_change().shift(-1)
    return return_df, all_return_df
def get_jq_factor_by_day(date):
    """Fetch every factor in ``g_factor_list`` for one date.

    The securities come from the module-level ``g_univ_dict[date]``.

    Returns:
        Whatever ``get_factor_values`` returns for that single day; callers
        in this file index it by factor name.
    """
    return get_factor_values(
        securities=g_univ_dict[date],
        factors=g_factor_list,
        start_date=date,
        end_date=date,
    )
def get_raw_factor_dict1(trade_date_list):
    """Sequentially fetch raw factor values and stack them per factor.

    Single-threaded counterpart of ``get_raw_factor_dict``.

    Args:
        trade_date_list: Iterable of dates to fetch.

    Returns:
        Dict mapping each name in ``g_factor_list`` to the per-day frames
        concatenated in date-list order (an empty DataFrame when there are no
        dates).
    """
    daily_results = [get_jq_factor_by_day(date) for date in trade_date_list]
    raw_factor_dict = {}
    for factor in g_factor_list:
        frames = [day[factor] for day in daily_results]
        # One concat per factor instead of growing a frame inside the date
        # loop; this also avoids stacking a frame twice when a factor name is
        # listed more than once.
        raw_factor_dict[factor] = pd.concat(frames) if frames else pd.DataFrame()
    return raw_factor_dict
def get_raw_factor_dict(trade_date_list):
    """Fetch raw factor values for all dates with one thread per date.

    Args:
        trade_date_list: Sequence of dates to fetch.

    Returns:
        Dict mapping each name in ``g_factor_list`` to the per-day frames
        concatenated in date-list order (an empty DataFrame when there are no
        dates). A progress counter is printed, one number per factor.
    """
    trade_date_list = list(trade_date_list)
    frame_list = []
    if trade_date_list:
        # A pool cannot be created with zero workers, hence the guard.
        pool = ThreadPool(processes=len(trade_date_list))
        try:
            frame_list = pool.map(get_jq_factor_by_day, trade_date_list)
        finally:
            # Release the worker threads even if a fetch raised.
            pool.close()
            pool.join()
    raw_factor_dict = {}
    for count, factor in enumerate(g_factor_list, start=1):
        frames = [day[factor] for day in frame_list]
        raw_factor_dict[factor] = (
            pd.concat(frames, axis=0) if frames else pd.DataFrame()
        )
        print(count, end=',')
    return raw_factor_dict
def get_Industry_by_day(date):
    """Label every stock of the day's universe with its industry code.

    The universe is the module-level ``g_univ_dict[date]``. For each code in
    the fixed list below, the members reported by ``get_industry_stocks`` that
    are also in the universe are tagged with that code. Stocks matched by no
    code stay NaN; a stock matched by several codes keeps the last one.

    Returns:
        A one-row DataFrame indexed by ``date`` with one column per stock.
    """
    # presumably Shenwan level-1 industry codes - TODO confirm
    industry_set = [
        '801010', '801020', '801030', '801040', '801050', '801080', '801110',
        '801120', '801130', '801140', '801150', '801160', '801170', '801180',
        '801200', '801210', '801230', '801710', '801720', '801730', '801740',
        '801750', '801760', '801770', '801780', '801790', '801880', '801890',
    ]
    universe = g_univ_dict[date]
    universe_members = set(universe)
    industry_df = pd.DataFrame(index=[date], columns=universe)
    for industry in industry_set:
        constituents = get_industry_stocks(industry, date=date)
        matched = list(universe_members.intersection(constituents))
        industry_df.loc[date, matched] = industry
    return industry_df
def get_industry_df(trade_date_list):
    """Stack the per-day industry labels for every date.

    Args:
        trade_date_list: Iterable of dates.

    Returns:
        DataFrame with one row per date (in input order) and one column per
        stock seen on any date; an empty DataFrame when there are no dates.
        A progress counter is printed, one number per date.
    """
    frames = []
    for count, date in enumerate(trade_date_list, start=1):
        frames.append(get_Industry_by_day(date))
        print(count, end=',')
    if not frames:
        return pd.DataFrame()
    # Single concat instead of re-copying the accumulated frame on every date.
    return pd.concat(frames, axis=0)
def replace_nan_indu(all_industry_df, factor_df, univ_dict):
    """Fill missing factor values with the same-day industry median.

    Stocks without an industry label on a date are dropped for that date.
    Values can remain NaN when a whole industry has no data for the factor.

    Args:
        all_industry_df: DataFrame of industry labels, dates as rows and
            securities as columns.
        factor_df: DataFrame of factor values with the same layout.
        univ_dict: Mapping of date -> list of securities to process.

    Returns:
        DataFrame with one row per date of ``univ_dict`` (in its iteration
        order) and one column per stock that had an industry label on at
        least one date; an empty DataFrame when ``univ_dict`` is empty.
    """
    filled_rows = []
    for date in univ_dict:
        univ = univ_dict[date]
        factor_by_day = factor_df.loc[date, univ].to_frame('values')
        # dropna + inner join removes stocks that have no industry label.
        industry_by_day = all_industry_df.loc[date, univ].dropna().to_frame('industry')
        factor_by_day = factor_by_day.merge(
            industry_by_day, left_index=True, right_index=True, how='inner'
        )
        mid = factor_by_day.groupby('industry').median()
        # After this merge 'values_x' is the stock's own value and 'values_y'
        # is the median of its industry.
        factor_by_day = factor_by_day.merge(
            mid, left_on='industry', right_index=True, how='left'
        )
        missing = factor_by_day['values_x'].isnull()
        factor_by_day.loc[missing, 'values_x'] = factor_by_day.loc[missing, 'values_y']
        filled_rows.append(factor_by_day['values_x'].to_frame(date).T)
    if not filled_rows:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(filled_rows)
def pretreat_factor(factor_df, g_univ_dict, neu):
    """Winsorize, optionally neutralize, and standardize a factor per date.

    Args:
        factor_df: DataFrame of factor values, dates as rows and securities
            as columns.
        g_univ_dict: Mapping of date -> list of securities to process
            (shadows the module-level name on purpose; the argument is used).
        neu: When true, the cross-section is neutralized against
            ``['jq_l1', 'market_cap']`` between winsorizing and
            standardizing.

    Returns:
        A float DataFrame with the same index and columns as ``factor_df``;
        cells that were not processed are NaN.
    """
    pretreat_factor_df = pd.DataFrame(
        index=list(factor_df.index), columns=list(factor_df.columns)
    )
    available = set(factor_df.columns)
    for date in sorted(g_univ_dict):
        # Current pandas raises KeyError when .loc is given labels that are
        # not columns; such stocks have no data anyway, so skip them up front.
        univ = [stock for stock in g_univ_dict[date] if stock in available]
        factor_se = factor_df.loc[date, univ].dropna()
        # NOTE(review): axis=1 here vs axis=0 for standardlize below is kept
        # from the original - confirm how jqfactor treats axis for a Series.
        factor_se = winsorize_med(
            factor_se, scale=3, inclusive=True, inf2nan=True, axis=1
        )
        if neu:
            factor_se = neutralize(
                factor_se, how=['jq_l1', 'market_cap'], date=date, axis=1
            )
        factor_se = standardlize(factor_se, inf2nan=True, axis=0)
        pretreat_factor_df.loc[date, list(factor_se.index)] = factor_se
    # The frame was created without a dtype (object); hand back numeric data.
    return pretreat_factor_df.astype(float)
def get_all_factor_dict(raw_factor_dict, g_univ_dict, all_industry_df):
    """Clean every raw factor: industry-median fill, then pre-treatment.

    Args:
        raw_factor_dict: Mapping of factor name -> raw factor DataFrame.
        g_univ_dict: Mapping of date -> list of securities.
        all_industry_df: DataFrame of industry labels per date and security.

    Returns:
        Dict mapping factor name -> processed factor DataFrame. A progress
        counter is printed, one number per factor.
    """
    all_factor_dict = {}
    for done, (key, raw_factor_df) in enumerate(raw_factor_dict.items(), start=1):
        # NaNs are replaced by the industry median; some can survive, e.g.
        # when a whole industry lacks the value or its only stock is NaN.
        filled_df = replace_nan_indu(all_industry_df, raw_factor_df, g_univ_dict)
        # Only factors listed in g_neu_factor are neutralized.
        should_neutralize = key in g_neu_factor
        all_factor_dict[key] = pretreat_factor(filled_df, g_univ_dict, should_neutralize)
        print(done, end=',')
    return all_factor_dict
# Driver for the factor-generation step: builds the universe, returns, raw
# and processed factors, then pickles everything to 'MyPackage.pkl'.
print('开始运行...')
today = datetime.date.today()
# NOTE: despite its name this is the first of the 10 trading days ending
# today; it is used as the end date of the sample.
yesterday = jqdata.get_trade_days(end_date=today, count=10)[0]
print('获取时间序列')
# Dates at which factors and returns are sampled (every 20 trading days).
trade_date_list = get_trade_dates(yesterday, g_count, 20)
print('获取股票池')
univ_list, g_univ_dict = get_stock_universe(trade_date_list, index=g_index)
print('获取历史回报')
# Returns are computed for all stocks, not only the index constituents.
return_df, all_return_df = get_return(trade_date_list, count=g_count)
print('获取因子,共计%d个,进度:' % len(g_factor_list))
raw_factor_dict = get_raw_factor_dict(trade_date_list)
print('\n获取行业数据')
all_industry_df = get_industry_df(trade_date_list)
print('\n处理数据---去极值化/中性化/标准化,共计%d个,进度:'% len(g_factor_list))
all_factor_dict = get_all_factor_dict(raw_factor_dict, g_univ_dict, all_industry_df)
print('\npickle序列化')
# The item order must match the unpacking done by the factor-testing step.
Package = [
    g_univ_dict,
    return_df,
    all_return_df,
    raw_factor_dict,
    all_factor_dict,
    all_industry_df,
]
# The context manager flushes and closes the file even if dumping fails.
with open('MyPackage.pkl', 'wb') as pkl_file:
    pickle.dump(Package, pkl_file, 0)
# time.clock() was removed in Python 3.8. Prefer it while it still exists so
# this reading uses the same clock as `starttime`, else use perf_counter().
_timer = getattr(time, 'clock', None) or time.perf_counter
endtime = _timer()
runtime = endtime - starttime
print('因子生成运行完成,用时 %.2f 秒' % runtime)