请 [注册] 或 [登录]  | 返回主站

量化交易吧 /  数理科学 帖子:3365634 新帖:21

财务指标对股票涨幅的机器学习简易框架

醒掌天下权发表于:5 月 9 日 18:30回复(1)

感谢陈轩分享的《基于机器学习的多因子选股策略》,本研究的思路和部分基础函数均来源于该策略。
在这里,提取机器学习的基本框架,用基本面因子作为输入,30天后涨幅作为输出,使用随机森林算法。
得到的结果并不理想,这里仅作为框架分享。

def get_q_Factor(feasible_stocks):
    """Build the fundamentals query for a stock universe.

    Args:
        feasible_stocks: stock codes to keep; passed to
            ``valuation.code.in_`` as the filter.

    Returns:
        The query object. It selects the stock code followed by 18
        fundamental fields, in exactly the column order that
        ``initialize_df`` relies on when it renames the columns.
    """
    q = query(
        valuation.code,
        valuation.market_cap,  # market capitalisation
        valuation.circulating_market_cap,
        balance.total_assets - balance.total_liability,  # net assets
        balance.total_assets / balance.total_liability,
        indicator.net_profit_to_total_revenue,  # net profit / total revenue
        indicator.inc_revenue_year_on_year,  # revenue growth (year on year)
        balance.development_expenditure,  # RD
        valuation.pe_ratio,  # price/earnings (TTM)
        valuation.pb_ratio,  # price/book (TTM)
        indicator.inc_net_profit_year_on_year,  # net profit growth (YoY)
        balance.dividend_payable,
        indicator.roe,
        indicator.roa,
        income.operating_profit / income.total_profit,  # OPTP
        indicator.gross_profit_margin,  # gross profit margin (GPM)
        balance.fixed_assets / balance.total_assets,  # FACR
        valuation.pcf_ratio,  # CFP
        valuation.ps_ratio,  # PS
    ).filter(valuation.code.in_(feasible_stocks))
    return q


def initialize_df(df):
    """Rename the raw fundamentals columns and derive the factor columns.

    Args:
        df: DataFrame with exactly 19 columns in the order selected by
            ``get_q_Factor``. It is not modified.

    Returns:
        A new DataFrame holding ``code`` plus the factor columns. The
        helper columns ``mcap``, ``pe``, ``dividend_payable``,
        ``pcf_ratio`` and ``development_expenditure`` are dropped and NaN
        is replaced by 0. Infinite values (e.g. the reciprocal of a zero
        ratio) are left in place for the caller to handle.
    """
    # Local import: no numpy import is visible elsewhere in the notebook.
    import numpy as np

    # Work on a copy so the caller's frame is not half-transformed.
    df = df.copy()

    # Column names, positionally matching the query in get_q_Factor.
    df.columns = ['code', 'mcap', 'CMV', 'log_NC', 'LEV', 'NI_p', 'g',
                  'development_expenditure', 'pe', 'BP', 'G_p',
                  'dividend_payable', 'ROE', 'ROA', 'OPTP', 'GPM', 'FACR',
                  'pcf_ratio', 'PS']

    # Label: log market capitalisation.
    df['log_mcap'] = np.log(df['mcap'])

    # Reciprocal factors. Vectorised division yields inf for a zero ratio
    # instead of raising ZeroDivisionError from a per-element lambda.
    df['EP'] = 1 / df['pe']
    df['BP'] = 1 / df['BP']
    df['CFP'] = 1 / df['pcf_ratio']  # cash-flow yield

    # Balance-sheet amounts scaled by market cap.
    # NOTE(review): the 1e8 factor presumably converts mcap from units of
    # 100 million yuan to yuan -- confirm against the data source.
    df['DP'] = df['dividend_payable'] / (df['mcap'] * 100000000)
    df['RD'] = df['development_expenditure'] / (df['mcap'] * 100000000)

    df['log_NC'] = np.log(df['log_NC'])
    df['CMV'] = np.log(df['CMV'])

    # Net profit margin. NI_n must be taken from the signed values: once
    # NI_p holds absolute values the "< 0" mask can never match and NI_n
    # would be all zeros.
    signed_margin = df['NI_p']
    df['NI_n'] = signed_margin.where(signed_margin < 0).abs()
    df['NI_p'] = signed_margin.abs()

    df['PEG'] = df['pe'] / (df['G_p'] * 100)

    # Drop the helper columns that were only needed to derive factors.
    df = df.drop(['mcap', 'pe', 'dividend_payable', 'pcf_ratio',
                  'development_expenditure'], axis=1)
    return df.fillna(0)


# Factor columns that the notebook winsorises and standardises.
__standardizeList = ['log_mcap', 'log_NC', 'LEV', 'NI_p', 'NI_n', 'g', 'RD',
                     'EP', 'BP', 'G_p', 'PEG', 'DP', 'CMV', 'ROE', 'ROA',
                     'OPTP', 'GPM', 'FACR', 'CFP', 'PS']
from datetime import datetime, timedelta, date
import json

from jqfactor import winsorize_med
from jqfactor import standardlize

# Stock universe: constituents of the index identified by secCode.
secCode = '000985.XSHG'
feasible_stocks = sample = get_index_stocks(secCode)

# Build the fundamentals query and dump its text form for inspection.
q_Factor = get_q_Factor(feasible_stocks)
write_file("test.txt", str(q_Factor))

# Fundamentals as of the training date, turned into factor columns.
date1 = datetime(2018, 3, 26)
df_train = initialize_df(get_fundamentals(q_Factor, date=date1))
write_file("机器学习/test.txt", str(df_train))
write_file('机器学习/df_train.csv', df_train.to_csv(), append=False)

# Apply the two jqfactor helpers to every factor column in turn:
# winsorize_med first, then standardlize on the result.
for fac in __standardizeList:
    df_train[fac] = winsorize_med(
        df_train[fac], scale=5, inclusive=True, inf2nan=True, axis=0
    )
    df_train[fac] = standardlize(df_train[fac], inf2nan=True, axis=0)

write_file('df_train_std.csv', df_train.to_csv(), append=False)
/opt/conda/lib/python3.6/site-packages/jqdata/db_utils.py:234: SADeprecationWarning: Compiled objects now compile within the constructor.
  comp.compile()
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:63: RuntimeWarning: invalid value encountered in log
1170397
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Daily closing prices for the whole universe over the 30 calendar days
# that follow the training date.
pan = get_price(sample, start_date=date1, end_date=date1 + timedelta(30),
                frequency='1d', fields=['close'])
close = pan['close']
write_file('机器学习/close.csv', close.to_csv(), append=False)

# Return over the window: first close to last close, one value per stock.
closefirst = close.iloc[0]
closelast = close.iloc[-1]
rate = ((closelast - closefirst) / closefirst).to_frame('rate')

# Fewer stocks have fundamentals than have prices, so keep only the
# stocks that are present in the training factors.
rate = rate[rate.index.isin(df_train['code'])]
write_file('机器学习/rate.csv', rate.to_csv(), append=False)

# Label: winsorised, standardised return, with NaN replaced by 0.
returns = rate['rate']
returns = winsorize_med(returns, scale=5, inclusive=True, inf2nan=True, axis=0)
returns = standardlize(returns, inf2nan=True, axis=0)
returns = returns.fillna(0)

model = RandomForestRegressor(random_state=42, n_estimators=500, n_jobs=-1)

# Full candidate feature list, kept for reference; it is overridden on the
# next line so that only 'BP' is actually used for training.
__trainList = ['log_mcap', 'log_NC', 'LEV', 'NI_p', 'g', 'EP', 'BP', 'G_p',
               'PEG', 'CMV', 'ROE', 'ROA', 'OPTP', 'GPM', 'FACR', 'CFP', 'PS']
__trainList = ['BP']

# Select feature rows by stock code, in the same order as the labels.
# The estimator ignores pandas indexes, so passing df_train rows in query
# order against labels in price-data order would pair a stock's factors
# with another stock's return. NaN left behind by inf2nan=True is filled
# with 0, mirroring the treatment of the labels.
# NOTE(review): assumes each code appears once in df_train -- confirm.
trainval = df_train.set_index('code').loc[rate.index, __trainList].fillna(0)
write_file('机器学习/trainval.csv', trainval.to_csv(), append=False)

model.fit(trainval, returns)
/opt/conda/lib/python3.6/site-packages/jqresearch/api.py:87: FutureWarning: 
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  pre_factor_ref_date=_get_today())
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
from datetime import datetime, timedelta, date
import json

from jqfactor import winsorize_med
from jqfactor import standardlize

# Same stock universe and fundamentals query as used for training.
secCode = '000985.XSHG'
sample = get_index_stocks(secCode)
feasible_stocks = sample
q_Factor = get_q_Factor(feasible_stocks)

# Fundamentals as of the prediction date, turned into factor columns.
date2 = datetime(2018, 5, 24)
df_test = get_fundamentals(q_Factor, date=date2)
df_test = initialize_df(df_test)
print(df_test.index)

# Winsorise, then standardise, each factor of the TEST frame. The results
# must be written back to df_test: assigning them to df_train would leave
# the test features unscaled and overwrite the training data, and the
# second step must consume the winsorised column rather than the raw one.
for fac in __standardizeList:
    df_test[fac] = winsorize_med(
        df_test[fac], scale=5, inclusive=True, inf2nan=True, axis=0
    )
    df_test[fac] = standardlize(df_test[fac], inf2nan=True, axis=0)
write_file('df_test_std.csv', df_test.to_csv(), append=False)

# inf2nan=True can leave NaN in the scaled columns; fill with 0 so the
# estimator is not handed missing values.
testval = df_test[__trainList].fillna(0)

# Predicted (standardised) return per stock, indexed by stock code.
returns_test = pd.DataFrame(model.predict(testval),
                            columns=['returns_test'],
                            index=df_test['code'])
write_file('机器学习/return_test.csv', returns_test.to_csv(), append=False)

# Last expression of the cell: show predictions from highest to lowest.
returns_test.sort_values(by=['returns_test'], axis=0, ascending=False)
/opt/conda/lib/python3.6/site-packages/jqdata/db_utils.py:234: SADeprecationWarning: Compiled objects now compile within the constructor.
  comp.compile()
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:63: RuntimeWarning: invalid value encountered in log
RangeIndex(start=0, stop=3407, step=1)

.dataframe tbody tr th:only-of-type {        vertical-align: middle;    }    .dataframe tbody tr th {        vertical-align: top;    }    .dataframe thead th {        text-align: right;    }


returns_test
code
000733.XSHE2.105685
002328.XSHE1.962498
600480.XSHG1.957466
300256.XSHE1.957466
000718.XSHE1.957466
000159.XSHE1.957225
000560.XSHE1.941611
002553.XSHE1.934437
600635.XSHG1.929270
002479.XSHE1.923779
300384.XSHE1.909129
002563.XSHE1.909129
000829.XSHE1.909129
002897.XSHE1.904111
601006.XSHG1.883202
600246.XSHG1.883202
300115.XSHE1.871234
601668.XSHG1.867599
002233.XSHE1.855992
002588.XSHE1.855992
600960.XSHG1.855125
600854.XSHG1.855125
600079.XSHG1.850056
002729.XSHE1.831690
300225.XSHE1.831690
600536.XSHG1.831690
002895.XSHE1.831690
002424.XSHE1.831690
600720.XSHG1.820810
600280.XSHG1.804304
......
002521.XSHE-1.609499
002633.XSHE-1.618845
603022.XSHG-1.618845
000615.XSHE-1.620078
300709.XSHE-1.620078
002573.XSHE-1.620078
002910.XSHE-1.625790
603879.XSHG-1.625790
000735.XSHE-1.627859
600372.XSHG-1.659541
002162.XSHE-1.659541
300326.XSHE-1.659541
000815.XSHE-1.659541
300382.XSHE-1.659541
603098.XSHG-1.678690
601588.XSHG-1.683554
002622.XSHE-1.684119
002223.XSHE-1.688453
000868.XSHE-1.688453
300592.XSHE-1.690268
000029.XSHE-1.690268
002779.XSHE-1.690268
600531.XSHG-1.696337
600789.XSHG-1.716047
002303.XSHE-1.721863
600163.XSHG-1.777979
002635.XSHE-1.777979
002607.XSHE-1.777979
603991.XSHG-1.989981
300540.XSHE-1.989981

3407 rows × 1 columns

 

全部回复

0/140

量化课程

    移动端课程