感谢陈轩分享的策略《基于机器学习的多因子选股策略》,本研究的思路和部分基础函数均来源于该策略。
在这里,提取机器学习的基本框架,用基本面因子作为输入,30天后涨幅作为输出,使用随机森林算法。
目前得到的预测效果并不理想,这里仅作为框架分享。
def get_q_Factor(feasible_stocks):
    """Build the fundamentals query used as model input.

    The selected fields are, in order, the 19 columns that
    ``initialize_df`` renames, so the two functions must be kept in sync.

    Args:
        feasible_stocks: collection of stock codes; the query is filtered
            to ``valuation.code`` values contained in it.

    Returns:
        The query object produced by ``query(...).filter(...)``.
    """
    q = query(
        valuation.code,
        valuation.market_cap,  # market capitalisation
        valuation.circulating_market_cap,
        balance.total_assets - balance.total_liability,  # net assets
        balance.total_assets / balance.total_liability,
        indicator.net_profit_to_total_revenue,  # net profit / total operating revenue
        indicator.inc_revenue_year_on_year,  # operating revenue growth (YoY)
        balance.development_expenditure,  # RD
        valuation.pe_ratio,  # price/earnings (TTM)
        valuation.pb_ratio,  # price/book (TTM)
        indicator.inc_net_profit_year_on_year,  # net profit growth (YoY)
        balance.dividend_payable,
        indicator.roe,
        indicator.roa,
        income.operating_profit / income.total_profit,  # OPTP
        indicator.gross_profit_margin,  # gross profit margin, GPM
        balance.fixed_assets / balance.total_assets,  # FACR
        valuation.pcf_ratio,  # CFP
        valuation.ps_ratio,  # PS
    ).filter(valuation.code.in_(feasible_stocks))
    return q


def initialize_df(df):
    """Rename the raw fundamentals columns and derive the factor columns.

    ``df`` is modified in place (columns renamed, factor columns added,
    helper columns deleted); the returned frame is a copy of it with NaN
    replaced by 0.

    Args:
        df: DataFrame with exactly 19 columns in the order selected by
            ``get_q_Factor``.

    Returns:
        DataFrame holding ``code`` plus the factor columns, NaN filled
        with 0. Infinite values (e.g. from a zero ratio) are not replaced.
    """
    # Column names, positionally matching the fields of get_q_Factor.
    df.columns = ['code', 'mcap', 'CMV', 'log_NC', 'LEV', 'NI_p', 'g',
                  'development_expenditure', 'pe', 'BP', 'G_p',
                  'dividend_payable', 'ROE', 'ROA', 'OPTP', 'GPM', 'FACR',
                  'pcf_ratio', 'PS']

    # Keep the signed net-profit ratio: both NI_p and NI_n derive from it.
    ni_raw = df['NI_p'].copy()

    # Label: log market cap.
    df['log_mcap'] = np.log(df['mcap'])

    # Reciprocal valuation ratios. Vectorized division yields inf for a
    # zero ratio instead of raising inside a per-element lambda.
    df['EP'] = 1.0 / df['pe']
    df['BP'] = 1.0 / df['BP']

    # NOTE(review): the 1e8 factor presumably converts mcap from units of
    # 100 million to the currency unit of the balance-sheet items - confirm.
    df['DP'] = df['dividend_payable'] / (df['mcap'] * 100000000)
    df['RD'] = df['development_expenditure'] / (df['mcap'] * 100000000)

    # Factor: cash yield.
    df['CFP'] = 1.0 / df['pcf_ratio']

    # Non-positive inputs give NaN/-inf here (NumPy emits a RuntimeWarning);
    # the NaN values are zero-filled at the end.
    df['log_NC'] = np.log(df['log_NC'])
    df['CMV'] = np.log(df['CMV'])

    # Factor: net profit ratio, split into magnitude (NI_p) and the
    # magnitude of negative values only (NI_n). NI_n must come from the
    # signed values: taking it from NI_p after abs() never matches "< 0"
    # and leaves the column entirely zero.
    df['NI_p'] = ni_raw.abs()
    df['NI_n'] = ni_raw.where(ni_raw < 0).abs()

    df['PEG'] = df['pe'] / (df['G_p'] * 100)

    # Drop raw inputs that are only needed to derive the factors above.
    for helper in ('mcap', 'pe', 'dividend_payable', 'pcf_ratio',
                   'development_expenditure'):
        del df[helper]

    df = df.fillna(0)
    return df


# Factor columns that get winsorized and standardized before modelling.
__standardizeList = ['log_mcap', 'log_NC', 'LEV', 'NI_p', 'NI_n', 'g', 'RD',
                     'EP', 'BP', 'G_p', 'PEG', 'DP', 'CMV', 'ROE', 'ROA',
                     'OPTP', 'GPM', 'FACR', 'CFP', 'PS']
from datetime import datetime, timedelta, date
import json

from jqfactor import winsorize_med, standardlize

# Universe: constituents of the index identified by secCode.
secCode = '000985.XSHG'
sample = get_index_stocks(secCode)
feasible_stocks = sample

# Fundamentals query for the universe; its text is dumped for inspection.
q_Factor = get_q_Factor(feasible_stocks)
write_file("test.txt", str(q_Factor))

# Training snapshot: fundamentals as of date1, turned into factor columns.
date1 = datetime(2018, 3, 26)
df_train = initialize_df(get_fundamentals(q_Factor, date=date1))
write_file("机器学习/test.txt", str(df_train))
write_file('机器学习/df_train.csv', df_train.to_csv(), append=False)

# Winsorize each factor column, then standardize the winsorized column.
for fac in __standardizeList:
    df_train[fac] = standardlize(
        winsorize_med(df_train[fac], scale=5, inclusive=True,
                      inf2nan=True, axis=0),
        inf2nan=True, axis=0)

write_file('df_train_std.csv', df_train.to_csv(), append=False)
/opt/conda/lib/python3.6/site-packages/jqdata/db_utils.py:234: SADeprecationWarning: Compiled objects now compile within the constructor. comp.compile() /opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:63: RuntimeWarning: invalid value encountered in log
1170397
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Daily closing prices of the whole universe for the 30 calendar days
# starting at the training snapshot date.
pan = get_price(sample, start_date=date1, end_date=date1 + timedelta(30),
                frequency='1d', fields=['close'])
close = pan['close']
write_file('机器学习/close.csv', close.to_csv(), append=False)

# Return over the window: (last close - first close) / first close.
closefirst = close.iloc[0]
closelast = close.iloc[-1]
# to_frame() names the column explicitly. pd.DataFrame(series, columns=[...])
# yields an all-NaN column whenever the Series already carries a name, which
# happens when the window contains a single trading day.
rate = ((closelast - closefirst) / closefirst).to_frame('rate')

# Fewer stocks have fundamentals than have prices, so keep only the codes
# that are present in the training features.
rate = rate[rate.index.isin(list(df_train['code']))]
write_file('机器学习/rate.csv', rate.to_csv(), append=False)

# Align the labels to the feature rows by stock code. model.fit() pairs
# rows purely by position, so relying on get_price and get_fundamentals
# returning codes in the same order could silently mislabel samples.
returns = rate['rate'].reindex(df_train['code'].values)
returns = winsorize_med(returns, scale=5, inclusive=True, inf2nan=True, axis=0)
returns = standardlize(returns, inf2nan=True, axis=0)
returns = returns.fillna(0)

model = RandomForestRegressor(random_state=42, n_estimators=500, n_jobs=-1)

__trainList = ['log_mcap', 'log_NC', 'LEV', 'NI_p', 'g', 'EP', 'BP', 'G_p',
               'PEG', 'CMV', 'ROE', 'ROA', 'OPTP', 'GPM', 'FACR', 'CFP', 'PS']
# NOTE(review): the full factor list above is immediately overridden, so the
# model is trained on 'BP' alone. Kept as in the original - confirm intent.
__trainList = ['BP']

trainval = df_train[__trainList]
write_file('机器学习/trainval.csv', trainval.to_csv(), append=False)
model.fit(trainval, returns)
/opt/conda/lib/python3.6/site-packages/jqresearch/api.py:87: FutureWarning: Panel is deprecated and will be removed in a future version. The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/. Pandas provides a `.to_xarray()` method to help automate this conversion. pre_factor_ref_date=_get_today())
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False)
from datetime import datetime, timedelta, date
import json

from jqfactor import winsorize_med, standardlize

# Same universe and fundamentals query as used for training.
secCode = '000985.XSHG'
sample = get_index_stocks(secCode)
feasible_stocks = sample
q_Factor = get_q_Factor(feasible_stocks)

# Prediction snapshot: fundamentals as of date2, turned into factor columns.
date2 = datetime(2018, 5, 24)
df_test = get_fundamentals(q_Factor, date=date2)
df_test = initialize_df(df_test)
print(df_test.index)

# Winsorize, then standardize, every factor column of the TEST frame.
# The results must be written back to df_test: assigning them to df_train
# leaves the prediction inputs unscaled and overwrites the training data.
for fac in __standardizeList:
    df_test[fac] = winsorize_med(df_test[fac], scale=5, inclusive=True,
                                 inf2nan=True, axis=0)
    df_test[fac] = standardlize(df_test[fac], inf2nan=True, axis=0)
write_file('df_test_std.csv', df_test.to_csv(), append=False)

# Predict with the fitted model and index the predictions by stock code.
testval = df_test[__trainList]
returns_test = pd.DataFrame(model.predict(testval),
                            columns=['returns_test'],
                            index=df_test['code'])
write_file('机器学习/return_test.csv', returns_test.to_csv(), append=False)

# Final expression: predictions ranked from highest to lowest (shown as the
# cell output; returns_test itself is left unsorted).
returns_test.sort_values(by=['returns_test'], axis=0, ascending=False)
/opt/conda/lib/python3.6/site-packages/jqdata/db_utils.py:234: SADeprecationWarning: Compiled objects now compile within the constructor. comp.compile() /opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:63: RuntimeWarning: invalid value encountered in log
RangeIndex(start=0, stop=3407, step=1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
returns_test | |
---|---|
code | |
000733.XSHE | 2.105685 |
002328.XSHE | 1.962498 |
600480.XSHG | 1.957466 |
300256.XSHE | 1.957466 |
000718.XSHE | 1.957466 |
000159.XSHE | 1.957225 |
000560.XSHE | 1.941611 |
002553.XSHE | 1.934437 |
600635.XSHG | 1.929270 |
002479.XSHE | 1.923779 |
300384.XSHE | 1.909129 |
002563.XSHE | 1.909129 |
000829.XSHE | 1.909129 |
002897.XSHE | 1.904111 |
601006.XSHG | 1.883202 |
600246.XSHG | 1.883202 |
300115.XSHE | 1.871234 |
601668.XSHG | 1.867599 |
002233.XSHE | 1.855992 |
002588.XSHE | 1.855992 |
600960.XSHG | 1.855125 |
600854.XSHG | 1.855125 |
600079.XSHG | 1.850056 |
002729.XSHE | 1.831690 |
300225.XSHE | 1.831690 |
600536.XSHG | 1.831690 |
002895.XSHE | 1.831690 |
002424.XSHE | 1.831690 |
600720.XSHG | 1.820810 |
600280.XSHG | 1.804304 |
... | ... |
002521.XSHE | -1.609499 |
002633.XSHE | -1.618845 |
603022.XSHG | -1.618845 |
000615.XSHE | -1.620078 |
300709.XSHE | -1.620078 |
002573.XSHE | -1.620078 |
002910.XSHE | -1.625790 |
603879.XSHG | -1.625790 |
000735.XSHE | -1.627859 |
600372.XSHG | -1.659541 |
002162.XSHE | -1.659541 |
300326.XSHE | -1.659541 |
000815.XSHE | -1.659541 |
300382.XSHE | -1.659541 |
603098.XSHG | -1.678690 |
601588.XSHG | -1.683554 |
002622.XSHE | -1.684119 |
002223.XSHE | -1.688453 |
000868.XSHE | -1.688453 |
300592.XSHE | -1.690268 |
000029.XSHE | -1.690268 |
002779.XSHE | -1.690268 |
600531.XSHG | -1.696337 |
600789.XSHG | -1.716047 |
002303.XSHE | -1.721863 |
600163.XSHG | -1.777979 |
002635.XSHE | -1.777979 |
002607.XSHE | -1.777979 |
603991.XSHG | -1.989981 |
300540.XSHE | -1.989981 |
3407 rows × 1 columns
本社区仅针对特定人员开放
查看需注册登录并通过风险意识测评
5秒后跳转登录页面...
移动端课程