Natural Language Processing: Sentiment Analysis Based on Neural Networks

Author: wdsadad 2019-09-21 21:40 Source: FX168财经网人物频道
import json
import os
import sys
import numpy as np

from tensorflow.keras.initializers import Constant
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.layers import Flatten, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
###-------- Tokenizer parameters ----------###
# maximum number of most-frequent words to keep
num_words = 1000
# placeholder token for out-of-vocabulary words
oov_token = '<oov_tok>'

###-------- Padding parameters ----------###
# maximum sequence length after padding
MAX_SEQUENCE_LENGTH = 16
# padding side
padding = 'pre'
# truncation side
trunc_type = 'pre'

###-------- Layer parameters ----------###
# Embedding layer output dimension
EMBEDDING_DIM = 100

###-------- Training parameters ----------###
# number of training epochs
num_epoch = 10
# (train + validation) : test split ratio
train_ratio = 0.9
# validation : train split ratio
valid_ratio = 0.2
def parse_data(file):
    # the dataset stores one JSON object per line
    for l in open(file, 'r'):
        yield json.loads(l)
sarcasm_data = list(parse_data('./Sarcasm_Headlines_Dataset_v2.json'))
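As a quick sanity check (output illustrative; each line of the v2 dataset is a JSON object with 'headline' and 'is_sarcastic' fields, as used below):

print(len(sarcasm_data))
print(sarcasm_data[0]['headline'], '->', sarcasm_data[0]['is_sarcastic'])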
def split_data(data):
    X = []
    y = []
    for s in data:
        X.append(s['headline'])
        y.append(s['is_sarcastic'])

    # (train + validation) : test = 9 : 1
    trainval_size = int(len(X) * train_ratio)
    # train + validation portion
    X_trainval = X[:trainval_size]
    y_trainval = y[:trainval_size]
    # train : validation = 8 : 2
    X_train, X_valid, y_train, y_valid = train_test_split(X_trainval,
                                                          y_trainval,
                                                          test_size=valid_ratio,
                                                          random_state=12)
    # test portion
    X_test = X[trainval_size:]
    y_test = y[trainval_size:]

    return X_train, X_valid, X_test, y_train, y_valid, y_test
train_sentences, valid_sentences, test_sentences, train_labels, valid_labels, test_labels = split_data(sarcasm_data)
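A quick check of the split sizes (the counts match the padded shapes and the test progress bar printed further below):

print(len(train_sentences), len(valid_sentences), len(test_sentences))
# 20605 5152 2862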
# Define the tokenizer.
# num_words: the maximum number of words to keep, based on word frequency.
#            Only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
# Fit the tokenizer on the training sentences
tokenizer.fit_on_texts(train_sentences)
# word -> index mapping learned from the training corpus
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
Found 26237 unique tokens.
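As a small check of the fitted tokenizer (the sample sentence is invented, so its exact indices are illustrative): Keras assigns the oov_token index 1, and texts_to_sequences maps any word outside the top num_words-1 to that index:

print(word_index[oov_token])  # 1: the OOV token always gets index 1
print(tokenizer.texts_to_sequences(['this headline was unexpectedly hilarious']))
# rare words appear as 1 (<oov_tok>)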

def get_paddedSequences(token, sentences):
    # convert sentences to integer sequences, then pad/truncate to a fixed length
    sequences = token.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, padding=padding, maxlen=MAX_SEQUENCE_LENGTH, truncating=trunc_type)

    return padded_sequences
train_padded = get_paddedSequences(tokenizer,train_sentences)
valid_padded = get_paddedSequences(tokenizer,valid_sentences)

print(train_padded.shape)
print(valid_padded.shape)
(20605, 16)
(5152, 16)
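For intuition: with padding='pre' and truncating='pre', short sequences are left-padded with zeros and long ones lose their leading tokens. A minimal sketch:

print(pad_sequences([[5, 7, 9]], padding=padding, maxlen=MAX_SEQUENCE_LENGTH, truncating=trunc_type))
# [[0 0 0 0 0 0 0 0 0 0 0 0 0 5 7 9]]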
def get_embedding_matrix():
    # initialize the embedding matrix;
    # EMBEDDING_DIM must match the dimension of the pre-trained vectors
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    # prepare embedding matrix
    embeddings_index = {}

    # parse each word and its vector from the GloVe file and store them in a dict
    with open(os.path.join('glove.6B.100d.txt')) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, 'f', sep=' ')
            embeddings_index[word] = coefs
    print('Found %s word vectors.' % len(embeddings_index))

    # build the embedding matrix from the dict
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)

        if embedding_vector is not None:
            # words not found in the embedding index will be all-zeros
            embedding_matrix[i] = embedding_vector

    return embedding_matrix
embedding_matrix = get_embedding_matrix()
Found 70823 word vectors.
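To actually consume this matrix, the Embedding layer would be initialized from it and frozen, exactly as the commented-out lines in the models below suggest; a sketch of that variant:

Embedding(num_words,
          EMBEDDING_DIM,
          embeddings_initializer=Constant(embedding_matrix),
          input_length=MAX_SEQUENCE_LENGTH,
          trainable=False)  # keep the pre-trained GloVe vectors fixed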
Three alternative architectures follow. Each Sequential assignment overwrites model, so the Bi-LSTM version defined last is the one compiled, summarized and trained below; to try another, define it last.
# Embedding + Flatten
model = Sequential([
    # num_words and EMBEDDING_DIM both affect the accuracy and loss
    # to load the pre-trained word embeddings into the Embedding layer,
    # uncomment embeddings_initializer and use trainable=False
    # so as to keep the embeddings fixed
    # output shape: (None, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
    Embedding(num_words,
              EMBEDDING_DIM,
              #embeddings_initializer=Constant(embedding_matrix),
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=True),
              #trainable=False),
    Flatten(),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])
# Embedding + Conv1D + GlobalMaxPooling1D
model = Sequential([
    # num_words and EMBEDDING_DIM both affect the accuracy and loss
    # uncomment embeddings_initializer and use trainable=False
    # to keep the pre-trained embeddings fixed
    Embedding(num_words,
              EMBEDDING_DIM,
              #embeddings_initializer=Constant(embedding_matrix),
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=True),
              #trainable=False),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])
# Embedding + LSTM
model = Sequential([
    # num_words and EMBEDDING_DIM both affect the accuracy and loss
    Embedding(num_words,
              EMBEDDING_DIM,
              #embeddings_initializer=Constant(embedding_matrix),
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=True),
              #trainable=False),
    Bidirectional(LSTM(64)),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, 16, 100)           100000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               84480     
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 774       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
=================================================================
Total params: 185,261
Trainable params: 185,261
Non-trainable params: 0
_________________________________________________________________
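The parameter counts can be verified by hand:

# Embedding: num_words * EMBEDDING_DIM            = 1000 * 100              = 100,000
# Bi-LSTM:   2 * 4 * (input + units + 1) * units  = 2 * 4 * (100+64+1) * 64 = 84,480
# Dense(6):  the Bi-LSTM outputs 2*64 = 128, so 128 * 6 + 6                 = 774
# Dense(1):  6 * 1 + 1                                                      = 7
# Total:     100,000 + 84,480 + 774 + 7                                     = 185,261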
%%time
history = model.fit(train_padded, 
                    train_labels, 
                    epochs=num_epoch, 
                    validation_data=(valid_padded,valid_labels),
                    verbose=2)
Train on 20605 samples, validate on 5152 samples
Epoch 1/10
 - 276s - loss: 0.4459 - acc: 0.7797 - val_loss: 0.3904 - val_acc: 0.8193
Epoch 2/10
 - 274s - loss: 0.3600 - acc: 0.8352 - val_loss: 0.3690 - val_acc: 0.8261
Epoch 3/10
 - 274s - loss: 0.3340 - acc: 0.8513 - val_loss: 0.3719 - val_acc: 0.8278
Epoch 4/10
 - 274s - loss: 0.3124 - acc: 0.8620 - val_loss: 0.3686 - val_acc: 0.8346
Epoch 5/10
 - 270s - loss: 0.2958 - acc: 0.8688 - val_loss: 0.3706 - val_acc: 0.8313
Epoch 6/10
 - 274s - loss: 0.2781 - acc: 0.8760 - val_loss: 0.3752 - val_acc: 0.8274
Epoch 7/10
 - 273s - loss: 0.2630 - acc: 0.8834 - val_loss: 0.3845 - val_acc: 0.8329
Epoch 8/10
 - 274s - loss: 0.2470 - acc: 0.8901 - val_loss: 0.3952 - val_acc: 0.8284
Epoch 9/10
 - 274s - loss: 0.2321 - acc: 0.8961 - val_loss: 0.4278 - val_acc: 0.8286
Epoch 10/10
 - 269s - loss: 0.2146 - acc: 0.9036 - val_loss: 0.4418 - val_acc: 0.8224
CPU times: user 32min 33s, sys: 12min 54s, total: 45min 27s
Wall time: 45min 35s
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()
  
plot_graphs(history, "acc")
plot_graphs(history, "loss")
test_padded = get_paddedSequences(tokenizer,test_sentences)

result = model.evaluate(test_padded, test_labels)  # returns [loss, accuracy]
print('Accuracy:{0:.2%}'.format(result[1]))

from sklearn.metrics import f1_score
pred_probs = model.predict(test_padded)  # sigmoid outputs in [0, 1]
pred_labels_int = [1 if prob[0] > 0.5 else 0 for prob in pred_probs]
print('f1-score:{0:.2%}'.format(f1_score(test_labels, pred_labels_int)))
2862/2862 [==============================] - 7s 3ms/step
Accuracy:82.35%
f1-score:81.32%
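Finally, a usage sketch (the headlines are made up for illustration): new text must pass through the same tokenizer and padding before model.predict.

new_headlines = ['scientists announce that everything is fine forever',
                 'city council approves new library budget']
new_padded = get_paddedSequences(tokenizer, new_headlines)
for headline, prob in zip(new_headlines, model.predict(new_padded)):
    print('%.3f  %s' % (prob[0], headline))  # predicted probability of sarcasm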
 