呵呵
# 导入模块
import numpy as np
import pandas as pd
data = get_price("000001.XSHG",count=3000,end_date="2019-04-03",fields=["open","close","high","low","volume"])
print(data.head())
datas = data.reset_index(drop=True)
datas.head()
open close high low volume 2006-12-04 2103.82 2161.65 2164.59 2100.56 8.723279e+09 2006-12-05 2167.36 2173.28 2195.53 2159.32 8.815717e+09 2006-12-06 2175.38 2156.60 2192.90 2097.42 1.043478e+10 2006-12-07 2152.59 2156.75 2206.51 2145.79 9.927877e+09 2006-12-08 2133.36 2093.64 2164.02 2090.40 8.088972e+09
open | close | high | low | volume | |
---|---|---|---|---|---|
0 | 2103.82 | 2161.65 | 2164.59 | 2100.56 | 8.723279e+09 |
1 | 2167.36 | 2173.28 | 2195.53 | 2159.32 | 8.815717e+09 |
2 | 2175.38 | 2156.60 | 2192.90 | 2097.42 | 1.043478e+10 |
3 | 2152.59 | 2156.75 | 2206.51 | 2145.79 | 9.927877e+09 |
4 | 2133.36 | 2093.64 | 2164.02 | 2090.40 | 8.088972e+09 |
计算特征
# # 获取预测的标签 这里n为10 也就是预测10天后的涨跌标签
def compute_prediction_int(df, n):
pred = np.sign(df['close'].shift(-n) - df['close'])
pred = pred.iloc[:-n]
return pred.astype(int)
## 计算Stochastic Oscillator
def stochastic_oscillator_d(df, n):
SOK = [0]
for i in range(n, len(df)):
high = df.loc[(i-n):i, 'high']
low = df.loc[(i-n):i, 'low']
SOK.append((df.loc[i, 'close'] - min(low)) / (max(high) - min(low)))
SOK = pd.Series(SOK, name='SOK')
df = df.join(SOK)
return df
## 计算Williams %R
def williams_R(df, n):
R = [0]
for i in range(n, len(df)):
high = df.loc[(i-n):i, 'high']
low = df.loc[(i-n):i, 'low']
R.append((max(high) - df.loc[i, 'close']) / (max(high) - min(low))*(-100))
williams_R = pd.Series(R, name='williams_R')
df = df.join(williams_R)
return df
## 计算变化率
def rate_of_change(df, n):
M = df['close'].diff(n - 1)
N = df['close'].shift(n - 1)
ROC = pd.Series(M / N, name='ROC_' + str(n))
df = df.join(ROC)
return df
## 计算RSI
def relative_strength_index(df, n):
i = 0
UpI = [0]
DoI = [0]
while i + 1 <= df.index[-1]:
UpMove = df.loc[i + 1, 'high'] - df.loc[i, 'high']
DoMove = df.loc[i, 'low'] - df.loc[i + 1, 'low']
if UpMove > DoMove and UpMove > 0:
UpD = UpMove
else:
UpD = 0
UpI.append(UpD)
if DoMove > UpMove and DoMove > 0:
DoD = DoMove
else:
DoD = 0
DoI.append(DoD)
i = i + 1
UpI = pd.Series(UpI)
DoI = pd.Series(DoI)
PosDI = pd.Series(UpI.ewm(span=n, min_periods=n).mean())
NegDI = pd.Series(DoI.ewm(span=n, min_periods=n).mean())
RSI = pd.Series(PosDI / (PosDI + NegDI), name='RSI_' + str(n))
df = df.join(RSI)
return df
## 计算On Balance Volume
def on_balance_volume(df, n):
i = 0
OBV = [0]
while i < df.index[-1]:
if df.loc[i + 1, 'close'] - df.loc[i, 'close'] > 0:
OBV.append(df.loc[i + 1, 'volume'])
if df.loc[i + 1, 'close'] - df.loc[i, 'close'] == 0:
OBV.append(0)
if df.loc[i + 1, 'close'] - df.loc[i, 'close'] < 0:
OBV.append(-df.loc[i + 1, 'volume'])
i = i + 1
OBV = pd.Series(OBV)
OBV_ma = pd.Series(OBV.rolling(n, min_periods=n).mean(), name='OBV_' + str(n))
df = df.join(OBV_ma)
return df
## 计算MACD
def macd(df, n_fast, n_slow):
EMAfast = pd.Series(df['close'].ewm(span=n_fast, min_periods=n_slow).mean())
EMAslow = pd.Series(df['close'].ewm(span=n_slow, min_periods=n_slow).mean())
MACD = pd.Series(EMAfast - EMAslow, name='MACD_' + str(n_fast) + '_' + str(n_slow))
df = df.join(MACD)
return df
# 数据集准备
def feature_extraction(data):
data = relative_strength_index(data, n=14)
data = stochastic_oscillator_d(data, n=14)
data = rate_of_change(data, n=14)
data = on_balance_volume(data, n=14)
data = macd(data, 12, 26)
data = williams_R(data, n = 14)
del(data['open'])
del(data['high'])
del(data['low'])
del(data['volume'])
return data
def prepare_data(df, horizon):
data = feature_extraction(df).dropna().iloc[:-horizon]
data['label'] = compute_prediction_int(data, n=horizon)
del(data['close'])
return data.dropna()
# 数据和特征获取并合并
datas1 = prepare_data(datas, horizon=10)
features = [x for x in datas1.columns if x not in ['gain', 'label']]
print(datas1.head(5))
print(features)
RSI_14 SOK ROC_14 ... MACD_12_26 williams_R label 25 0.857233 0.813521 0.167247 ... 107.008562 -18.647937 1.0 26 0.640311 0.391454 0.138787 ... 99.453351 -60.854551 1.0 27 0.652436 0.388791 0.192446 ... 101.454929 -61.120866 1.0 28 0.685103 0.042725 0.158168 ... 103.734091 -95.727490 1.0 29 0.719127 0.005756 0.120646 ... 101.250194 -99.424404 1.0 [5 rows x 7 columns] ['RSI_14', 'SOK', 'ROC_14', 'OBV_14', 'MACD_12_26', 'williams_R']
# 训练集和测试集 划分
train_size = 2*len(datas1) // 3
train_df = datas1[:train_size]
test_df = datas1[train_size:]
print('len train', len(train_df))
print('len test', len(test_df))
print(train_df.head(5))
print(test_df.head(5))
len train 1961 len test 981 RSI_14 SOK ROC_14 ... MACD_12_26 williams_R label 25 0.857233 0.813521 0.167247 ... 107.008562 -18.647937 1.0 26 0.640311 0.391454 0.138787 ... 99.453351 -60.854551 1.0 27 0.652436 0.388791 0.192446 ... 101.454929 -61.120866 1.0 28 0.685103 0.042725 0.158168 ... 103.734091 -95.727490 1.0 29 0.719127 0.005756 0.120646 ... 101.250194 -99.424404 1.0 [5 rows x 7 columns] RSI_14 SOK ROC_14 ... MACD_12_26 williams_R label 1986 0.448424 0.998331 -0.059932 ... 29.119562 -0.166869 1.0 1987 0.400803 0.743751 0.006476 ... 18.827430 -25.624891 1.0 1988 0.311296 0.801043 -0.030614 ... 5.713452 -19.895707 1.0 1989 0.307633 0.693099 -0.068748 ... -3.093720 -30.690075 1.0 1990 0.359978 0.667756 -0.060344 ... -6.251662 -33.224405 1.0 [5 rows x 7 columns]
train_df.iloc[:,:-1].head()
RSI_14 | SOK | ROC_14 | OBV_14 | MACD_12_26 | williams_R | |
---|---|---|---|---|---|---|
25 | 0.857233 | 0.813521 | 0.167247 | 4.793883e+09 | 107.008562 | -18.647937 |
26 | 0.640311 | 0.391454 | 0.138787 | 3.468150e+09 | 99.453351 | -60.854551 |
27 | 0.652436 | 0.388791 | 0.192446 | 4.717751e+09 | 101.454929 | -61.120866 |
28 | 0.685103 | 0.042725 | 0.158168 | 5.008422e+09 | 103.734091 | -95.727490 |
29 | 0.719127 | 0.005756 | 0.120646 | 3.570444e+09 | 101.250194 | -99.424404 |
# 随机深林训练 导入包
from sklearn.ensemble import RandomForestClassifier
# 训练模型
clf = RandomForestClassifier(n_estimators=65, max_features="auto",max_depth=30,min_samples_split=200)
clf
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=30, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=200, min_weight_fraction_leaf=0.0, n_estimators=65, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, confusion_matrix, recall_score, accuracy_score
clf.fit(train_df.iloc[:,:-1],train_df.iloc[:,-1])
# 模型调用
pre_train = clf.predict(train_df.iloc[:,:-1])
# print("在训练集预测的结果为:",pre_train)
print("在训练集的accuracy_score为:",accuracy_score(pre_train,train_df.iloc[:,-1]))
pre_test = clf.predict(test_df.iloc[:,:-1])
# print("在测试集预测的结果为:",pre_test)
print("在测试集的accuracy_score为:",accuracy_score(pre_test,test_df.iloc[:,-1]))
在训练集的accuracy_score为: 0.7781744008159103 在测试集的accuracy_score为: 0.7492354740061162
#!usr/bin/env python
"""
Version : 0.1.6
Date : 15th April 2017
Author : Pierre-Yves Lablanche
Email : plablanche@aims.ac.za
Affiliation : African Institute for Mathematical Sciences - South Africa
Stellenbosch University - South Africa
License : MIT
Status : Not Under Active Development
Description :
Python3 implementation of the gcForest algorithm preesented in Zhou and Feng 2017
(paper can be found here : https://arxiv.org/abs/1702.08835 ).
It uses the typical scikit-learn syntax with a .fit() function for training
and a .predict() function for predictions.
"""
import itertools
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
__author__ = "Pierre-Yves Lablanche"
__email__ = "plablanche@aims.ac.za"
__license__ = "MIT"
__version__ = "0.1.6"
#__status__ = "Development"
# noinspection PyUnboundLocalVariable
class gcForest(object):
def __init__(self, shape_1X=None, n_mgsRFtree=30, window=None, stride=1,
cascade_test_size=0.2, n_cascadeRF=2, n_cascadeRFtree=101, cascade_layer=np.inf,
min_samples_mgs=0.1, min_samples_cascade=0.05, tolerance=0.0, n_jobs=1):
""" gcForest Classifier.
:param shape_1X: int or tuple list or np.array (default=None)
Shape of a single sample element [n_lines, n_cols]. Required when calling mg_scanning!
For sequence data a single int can be given.
:param n_mgsRFtree: int (default=30)
Number of trees in a Random Forest during Multi Grain Scanning.
:param window: int (default=None)
List of window sizes to use during Multi Grain Scanning.
If 'None' no slicing will be done.
:param stride: int (default=1)
Step used when slicing the data.
:param cascade_test_size: float or int (default=0.2)
Split fraction or absolute number for cascade training set splitting.
:param n_cascadeRF: int (default=2)
Number of Random Forests in a cascade layer.
For each pseudo Random Forest a complete Random Forest is created, hence
the total numbe of Random Forests in a layer will be 2*n_cascadeRF.
:param n_cascadeRFtree: int (default=101)
Number of trees in a single Random Forest in a cascade layer.
:param min_samples_mgs: float or int (default=0.1)
Minimum number of samples in a node to perform a split
during the training of Multi-Grain Scanning Random Forest.
If int number_of_samples = int.
If float, min_samples represents the fraction of the initial n_samples to consider.
:param min_samples_cascade: float or int (default=0.1)
Minimum number of samples in a node to perform a split
during the training of Cascade Random Forest.
If int number_of_samples = int.
If float, min_samples represents the fraction of the initial n_samples to consider.
:param cascade_layer: int (default=np.inf)
mMximum number of cascade layers allowed.
Useful to limit the contruction of the cascade.
:param tolerance: float (default=0.0)
Accuracy tolerance for the casacade growth.
If the improvement in accuracy is not better than the tolerance the construction is
stopped.
:param n_jobs: int (default=1)
The number of jobs to run in parallel for any Random Forest fit and predict.
If -1, then the number of jobs is set to the number of cores.
"""
setattr(self, 'shape_1X', shape_1X)
setattr(self, 'n_layer', 0)
setattr(self, '_n_samples', 0)
setattr(self, 'n_cascadeRF', int(n_cascadeRF))
if isinstance(window, int):
setattr(self, 'window', [window])
elif isinstance(window, list):
setattr(self, 'window', window)
setattr(self, 'stride', stride)
setattr(self, 'cascade_test_size', cascade_test_size)
setattr(self, 'n_mgsRFtree', int(n_mgsRFtree))
setattr(self, 'n_cascadeRFtree', int(n_cascadeRFtree))
setattr(self, 'cascade_layer', cascade_layer)
setattr(self, 'min_samples_mgs', min_samples_mgs)
setattr(self, 'min_samples_cascade', min_samples_cascade)
setattr(self, 'tolerance', tolerance)
setattr(self, 'n_jobs', n_jobs)
def fit(self, X, y):
""" Training the gcForest on input data X and associated target y.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param y: np.array
1D array containing the target values.
Must be of shape [n_samples]
"""
if np.shape(X)[0] != len(y):
raise ValueError('Sizes of y and X do not match.')
mgs_X = self.mg_scanning(X, y)
_ = self.cascade_forest(mgs_X, y)
def predict_proba(self, X):
""" Predict the class probabilities of unknown samples X.
:param X: np.array
Array containing the input samples.
Must be of the same shape [n_samples, data] as the training inputs.
:return: np.array
1D array containing the predicted class probabilities for each input sample.
"""
mgs_X = self.mg_scanning(X)
cascade_all_pred_prob = self.cascade_forest(mgs_X)
predict_proba = np.mean(cascade_all_pred_prob, axis=0)
return predict_proba
def predict(self, X):
""" Predict the class of unknown samples X.
:param X: np.array
Array containing the input samples.
Must be of the same shape [n_samples, data] as the training inputs.
:return: np.array
1D array containing the predicted class for each input sample.
"""
pred_proba = self.predict_proba(X=X)
predictions = np.argmax(pred_proba, axis=1)
return predictions
def mg_scanning(self, X, y=None):
""" Performs a Multi Grain Scanning on input data.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param y: np.array (default=None)
:return: np.array
Array of shape [n_samples, .. ] containing Multi Grain Scanning sliced data.
"""
setattr(self, '_n_samples', np.shape(X)[0])
shape_1X = getattr(self, 'shape_1X')
if isinstance(shape_1X, int):
shape_1X = [1,shape_1X]
if not getattr(self, 'window'):
setattr(self, 'window', [shape_1X[1]])
mgs_pred_prob = []
for wdw_size in getattr(self, 'window'):
wdw_pred_prob = self.window_slicing_pred_prob(X, wdw_size, shape_1X, y=y)
mgs_pred_prob.append(wdw_pred_prob)
return np.concatenate(mgs_pred_prob, axis=1)
def window_slicing_pred_prob(self, X, window, shape_1X, y=None):
""" Performs a window slicing of the input data and send them through Random Forests.
If target values 'y' are provided sliced data are then used to train the Random Forests.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param window: int
Size of the window to use for slicing.
:param shape_1X: list or np.array
Shape of a single sample.
:param y: np.array (default=None)
Target values. If 'None' no training is done.
:return: np.array
Array of size [n_samples, ..] containing the Random Forest.
prediction probability for each input sample.
"""
n_tree = getattr(self, 'n_mgsRFtree')
min_samples = getattr(self, 'min_samples_mgs')
stride = getattr(self, 'stride')
if shape_1X[0] > 1:
print('Slicing Images...')
sliced_X, sliced_y = self._window_slicing_img(X, window, shape_1X, y=y, stride=stride)
else:
print('Slicing Sequence...')
sliced_X, sliced_y = self._window_slicing_sequence(X, window, shape_1X, y=y, stride=stride)
if y is not None:
n_jobs = getattr(self, 'n_jobs')
prf = RandomForestClassifier(n_estimators=n_tree, max_features='sqrt',
min_samples_split=min_samples, oob_score=True, n_jobs=n_jobs)
crf = RandomForestClassifier(n_estimators=n_tree, max_features=1,
min_samples_split=min_samples, oob_score=True, n_jobs=n_jobs)
print('Training MGS Random Forests...')
prf.fit(sliced_X, sliced_y)
crf.fit(sliced_X, sliced_y)
setattr(self, '_mgsprf_{}'.format(window), prf)
setattr(self, '_mgscrf_{}'.format(window), crf)
pred_prob_prf = prf.oob_decision_function_
pred_prob_crf = crf.oob_decision_function_
if hasattr(self, '_mgsprf_{}'.format(window)) and y is None:
prf = getattr(self, '_mgsprf_{}'.format(window))
crf = getattr(self, '_mgscrf_{}'.format(window))
pred_prob_prf = prf.predict_proba(sliced_X)
pred_prob_crf = crf.predict_proba(sliced_X)
pred_prob = np.c_[pred_prob_prf, pred_prob_crf]
return pred_prob.reshape([getattr(self, '_n_samples'), -1])
def _window_slicing_img(self, X, window, shape_1X, y=None, stride=1):
""" Slicing procedure for images
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param window: int
Size of the window to use for slicing.
:param shape_1X: list or np.array
Shape of a single sample [n_lines, n_cols].
:param y: np.array (default=None)
Target values.
:param stride: int (default=1)
Step used when slicing the data.
:return: np.array and np.array
Arrays containing the sliced images and target values (empty if 'y' is None).
"""
if any(s < window for s in shape_1X):
raise ValueError('window must be smaller than both dimensions for an image')
len_iter_x = np.floor_divide((shape_1X[1] - window), stride) + 1
len_iter_y = np.floor_divide((shape_1X[0] - window), stride) + 1
iterx_array = np.arange(0, stride*len_iter_x, stride)
itery_array = np.arange(0, stride*len_iter_y, stride)
ref_row = np.arange(0, window)
ref_ind = np.ravel([ref_row + shape_1X[1] * i for i in range(window)])
inds_to_take = [ref_ind + ix + shape_1X[1] * iy
for ix, iy in itertools.product(iterx_array, itery_array)]
sliced_imgs = np.take(X, inds_to_take, axis=1).reshape(-1, window**2)
if y is not None:
sliced_target = np.repeat(y, len_iter_x * len_iter_y)
elif y is None:
sliced_target = None
return sliced_imgs, sliced_target
def _window_slicing_sequence(self, X, window, shape_1X, y=None, stride=1):
""" Slicing procedure for sequences (aka shape_1X = [.., 1]).
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param window: int
Size of the window to use for slicing.
:param shape_1X: list or np.array
Shape of a single sample [n_lines, n_col].
:param y: np.array (default=None)
Target values.
:param stride: int (default=1)
Step used when slicing the data.
:return: np.array and np.array
Arrays containing the sliced sequences and target values (empty if 'y' is None).
"""
if shape_1X[1] < window:
raise ValueError('window must be smaller than the sequence dimension')
len_iter = np.floor_divide((shape_1X[1] - window), stride) + 1
iter_array = np.arange(0, stride*len_iter, stride)
ind_1X = np.arange(np.prod(shape_1X))
inds_to_take = [ind_1X[i:i+window] for i in iter_array]
sliced_sqce = np.take(X, inds_to_take, axis=1).reshape(-1, window)
if y is not None:
sliced_target = np.repeat(y, len_iter)
elif y is None:
sliced_target = None
return sliced_sqce, sliced_target
def cascade_forest(self, X, y=None):
""" Perform (or train if 'y' is not None) a cascade forest estimator.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param y: np.array (default=None)
Target values. If 'None' perform training.
:return: np.array
1D array containing the predicted class for each input sample.
"""
if y is not None:
setattr(self, 'n_layer', 0)
test_size = getattr(self, 'cascade_test_size')
max_layers = getattr(self, 'cascade_layer')
tol = getattr(self, 'tolerance')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
self.n_layer += 1
prf_crf_pred_ref = self._cascade_layer(X_train, y_train)
accuracy_ref = self._cascade_evaluation(X_test, y_test)
feat_arr = self._create_feat_arr(X_train, prf_crf_pred_ref)
self.n_layer += 1
prf_crf_pred_layer = self._cascade_layer(feat_arr, y_train)
accuracy_layer = self._cascade_evaluation(X_test, y_test)
while accuracy_layer > (accuracy_ref + tol) and self.n_layer <= max_layers:
accuracy_ref = accuracy_layer
prf_crf_pred_ref = prf_crf_pred_layer
feat_arr = self._create_feat_arr(X_train, prf_crf_pred_ref)
self.n_layer += 1
prf_crf_pred_layer = self._cascade_layer(feat_arr, y_train)
accuracy_layer = self._cascade_evaluation(X_test, y_test)
if accuracy_layer < accuracy_ref :
n_cascadeRF = getattr(self, 'n_cascadeRF')
for irf in range(n_cascadeRF):
delattr(self, '_casprf{}_{}'.format(self.n_layer, irf))
delattr(self, '_cascrf{}_{}'.format(self.n_layer, irf))
self.n_layer -= 1
elif y is None:
at_layer = 1
prf_crf_pred_ref = self._cascade_layer(X, layer=at_layer)
while at_layer < getattr(self, 'n_layer'):
at_layer += 1
feat_arr = self._create_feat_arr(X, prf_crf_pred_ref)
prf_crf_pred_ref = self._cascade_layer(feat_arr, layer=at_layer)
return prf_crf_pred_ref
def _cascade_layer(self, X, y=None, layer=0):
""" Cascade layer containing Random Forest estimators.
If y is not None the layer is trained.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param y: np.array (default=None)
Target values. If 'None' perform training.
:param layer: int (default=0)
Layer indice. Used to call the previously trained layer.
:return: list
List containing the prediction probabilities for all samples.
"""
n_tree = getattr(self, 'n_cascadeRFtree')
n_cascadeRF = getattr(self, 'n_cascadeRF')
min_samples = getattr(self, 'min_samples_cascade')
n_jobs = getattr(self, 'n_jobs')
prf = RandomForestClassifier(n_estimators=n_tree, max_features='sqrt',
min_samples_split=min_samples, oob_score=True, n_jobs=n_jobs)
crf = RandomForestClassifier(n_estimators=n_tree, max_features=1,
min_samples_split=min_samples, oob_score=True, n_jobs=n_jobs)
prf_crf_pred = []
if y is not None:
print('Adding/Training Layer, n_layer={}'.format(self.n_layer))
for irf in range(n_cascadeRF):
prf.fit(X, y)
crf.fit(X, y)
setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
prf_crf_pred.append(prf.oob_decision_function_)
prf_crf_pred.append(crf.oob_decision_function_)
elif y is None:
for irf in range(n_cascadeRF):
prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
prf_crf_pred.append(prf.predict_proba(X))
prf_crf_pred.append(crf.predict_proba(X))
return prf_crf_pred
def _cascade_evaluation(self, X_test, y_test):
""" Evaluate the accuracy of the cascade using X and y.
:param X_test: np.array
Array containing the test input samples.
Must be of the same shape as training data.
:param y_test: np.array
Test target values.
:return: float
the cascade accuracy.
"""
casc_pred_prob = np.mean(self.cascade_forest(X_test), axis=0)
casc_pred = np.argmax(casc_pred_prob, axis=1)
casc_accuracy = accuracy_score(y_true=y_test, y_pred=casc_pred)
print('Layer validation accuracy = {}'.format(casc_accuracy))
return casc_accuracy
def _create_feat_arr(self, X, prf_crf_pred):
""" Concatenate the original feature vector with the predicition probabilities
of a cascade layer.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param prf_crf_pred: list
Prediction probabilities by a cascade layer for X.
:return: np.array
Concatenation of X and the predicted probabilities.
To be used for the next layer in a cascade forest.
"""
swap_pred = np.swapaxes(prf_crf_pred, 0, 1)
add_feat = swap_pred.reshape([np.shape(X)[0], -1])
feat_arr = np.concatenate([add_feat, X], axis=1)
return feat_arr
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
datas1.iloc[:,:-1], datas1.iloc[:,-1], test_size=0.33)
clf1 = clf = RandomForestClassifier(n_estimators=65, max_features="auto",max_depth=30,min_samples_split=200)
clf1.fit(X_train, y_train)
pre_test1 = clf1.predict(X_test)
print("在测试集的accuracy_score为:",accuracy_score(pre_test1, y_test))
在测试集的accuracy_score为: 0.7734294541709578
gcf = gcForest(shape_1X=6,window=4,stride=2)
gcf.fit(np.array(X_train),y_train)
y_pred = gcf.predict(np.array(X_test))
print("在测试集的accuracy_score为:",accuracy_score(y_pred, y_test))
Slicing Sequence... Training MGS Random Forests... Adding/Training Layer, n_layer=1 Layer validation accuracy = 0.4278481012658228 Adding/Training Layer, n_layer=2 Layer validation accuracy = 0.4278481012658228 Slicing Sequence... 在测试集的accuracy_score为: 0.45314109165808447
本社区仅针对特定人员开放
查看需注册登录并通过风险意识测评
5秒后跳转登录页面...
移动端课程