商品評論情感化分析案例(LDA主題分析)

2021-03-02 Python面面觀
分析內容文章目錄
因為內容較長，這裡附上文章內容目錄（上傳圖片有些失真模糊，湊合看看了）：
數據清洗、預處理
文本數據，是一種非結構化數據。因此，其預處理的步驟與方式也會與結構化數據有所差異。文本數據預處理主要包括：
去除完全重複的數據
import pandas as pd
import re
import jieba.posseg as psg
import numpy as np
# Load the raw reviews, keep only the review text and its label, and drop
# rows that are exact duplicates on those two columns.
raw_reviews = pd.read_csv('/reviews.csv')
reviews = raw_reviews.loc[:, ['content', 'content_type']].drop_duplicates()
content = reviews['content']
reviews.head()
去除英文、數字等；去除業務相關詞（難以識別）
# 去除英文、數字等
# The reviews are about a Midea electric water heater sold on JD, so the
# platform/brand/product words are stripped together with ASCII letters and
# digits. The longer product name is listed before the shorter one so that it
# is removed as a whole. The pattern used to end with a stray "|", which added
# an empty alternative that matched at every position; it is dropped here.
strinfo = re.compile(r'[0-9a-zA-Z]|京東|美的|電熱水器|熱水器')
content = content.apply(lambda text: strinfo.sub('', text))

進行分詞、詞性標註
# 分詞、詞性標註
def worker(s):
    """Return the (word, part-of-speech flag) pairs that jieba yields for *s*."""
    return [(token.word, token.flag) for token in psg.cut(s)]


# Tokenise and POS-tag every cleaned review.
seg_word = content.apply(worker)
seg_word.head()
0    [(東西, ns), (收到, v), (這麼久, r), (，, x), (都, d), ...
1    [(安裝, v), (師傅, nr), (很, d), (給, p), (力, n), (，...
2    [(還, d), (沒, v), (安裝, v), (，, x), (基本, n), (滿意...
3    [(收到, v), (了, ul), (，, x), (自營, vn), (商品, n), ...
4    [(用, p), (了, ul), (幾次, m), (才, d), (來, v), (評價...
Name: content, dtype: object

將詞語轉為DataFrame；刪除標點符號
# 將詞語轉為dataframe形式，一列是詞，一列是詞語所在的句子ID，最後一列是詞語在該句子的位置
# Build a long-format token table: one row per token, carrying the id of the
# review it came from, the token, its POS flag and the review's label.
#
# The nested lists are flattened with comprehensions; the previous
# `sum(list_of_lists, [])` re-copied the accumulated list on every step and
# was quadratic in the number of tokens.

# Number of tokens in each review.
n_word = seg_word.apply(len)

# Review id (row label + 1), repeated once per token of that review.
index_content = [review_label + 1
                 for review_label, n_tokens in zip(seg_word.index, n_word)
                 for _ in range(n_tokens)]

# Label of the review, repeated once per token of that review.
content_type = [label
                for label, n_tokens in zip(reviews['content_type'], n_word)
                for _ in range(n_tokens)]

# Flatten the per-review lists of (word, flag) pairs into one list.
seg_word = [pair for tokens in seg_word for pair in tokens]

# Token and POS-flag columns.
word = [pair[0] for pair in seg_word]
nature = [pair[1] for pair in seg_word]

result = pd.DataFrame({"index_content": index_content,
                       "word": word,
                       "nature": nature,
                       "content_type": content_type})

# Drop punctuation: rows whose POS flag is 'x'.
result = result[result['nature'] != 'x']
result.head()
刪除停用詞
# 刪除停用詞，導入停用詞詞庫
# Load the stop-word list, one word per line. The context manager closes the
# file handle, which the bare open() call used to leave open.
with open("/stoplist.txt", 'r', encoding='UTF-8') as stop_path:
    stop = [line.replace('\n', '') for line in stop_path]

# Keep only the tokens that are not stop words. The .copy() makes `result` an
# independent frame, so adding a column below does not write into a slice.
word = list(set(word) - set(stop))
result = result[result['word'].isin(word)].copy()

# 0-based position of each remaining word inside its own review.
# cumcount() numbers the rows of every review in their current order and
# aligns on the row index, so it does not depend on the reviews being stored
# contiguously and sorted, which the positional list assignment assumed.
result['index_word'] = result.groupby(by=['index_content']).cumcount()

數據探索
名詞探索
ind = result[['n' in x for x in result['nature']]]
ind.head()
繪製名詞詞雲查看效果
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Count how often each word occurs in `result`, most frequent first.
# NOTE(review): the counts are taken over every row of `result`, not over the
# noun-only subset `ind` — confirm which one this word cloud should show.
frequencies = result.groupby(by = ['word'])['word'].count()
frequencies = frequencies.sort_values(ascending = False)
# NOTE(review): font_path points at a macOS system font; other platforms will
# presumably need a different font that contains CJK glyphs.
wordcloud = WordCloud(font_path='/System/Library/Fonts/STHeiti Light.ttc',
                      max_words=70,
                      height=800, # canvas height is 800
                      width=1200, # canvas width is 1200
                      scale=20,
                      background_color='white'
                     )
# Lay the cloud out from the word -> count mapping and display it without axes.
my_wordcloud = wordcloud.fit_words(frequencies)
plt.imshow(my_wordcloud)
plt.axis('off') 
plt.show()

修正情感傾向，繪製正面/負面情感詞詞雲
# 匹配情感詞
# Token table written by the preprocessing step (one row per word).
word = pd.read_csv("/word.csv")


def _read_lexicon(path):
    """Return a one-column DataFrame (column 0) with one entry per non-empty line.

    The lexicon files are plain word lists. They are read line by line
    because current pandas rejects ``read_csv(..., sep="\\n")``.
    """
    with open(path, encoding='utf-8') as lexicon_file:
        entries = [line.rstrip('\r\n') for line in lexicon_file]
    return pd.DataFrame({0: [entry for entry in entries if entry]})


# Positive / negative evaluation words and emotion words.
pos_comment = _read_lexicon("/正面評價詞語（中文）.txt")
neg_comment = _read_lexicon("/負面評價詞語（中文）.txt")
pos_emotion = _read_lexicon("/正面情感詞語（中文）.txt")
neg_emotion = _read_lexicon("/負面情感詞語（中文）.txt")

# Union of evaluation and emotion words for each polarity.
positive = set(pos_comment.iloc[:, 0]) | set(pos_emotion.iloc[:, 0])
negative = set(neg_comment.iloc[:, 0]) | set(neg_emotion.iloc[:, 0])

# Words listed under both polarities are ambiguous, so they are removed from
# both sides. Sorting makes the row order reproducible between runs.
intersection = positive & negative
positive = sorted(positive - intersection)
negative = sorted(negative - intersection)

# Weight +1 for positive words, -1 for negative words.
positive = pd.DataFrame({"word": positive,
                         "weight": [1] * len(positive)})
negative = pd.DataFrame({"word": negative,
                         "weight": [-1] * len(negative)})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and keeps each frame's own row labels.
posneg = pd.concat([positive, negative])

# Attach the weight to every token (right join keeps every token row; tokens
# that are not sentiment words get NaN), then restore reading order.
data_posneg = posneg.merge(word, left_on='word', right_on='word',
                           how='right')
data_posneg = data_posneg.sort_values(by=['index_content', 'index_word'])
# Negation-aware correction of the sentiment weights.
#
# A sentiment word has its weight flipped when exactly one of the (up to) two
# words directly before it in the same review is a negation word. Two
# negations cancel out, so the weight is left unchanged in that case.

# Negation word list; the words are read from its `term` column.
notdict = pd.read_csv("data/not.csv")
# `x in series` tests the Series *index*, not its values, so the membership
# test has to run against the values. A set also makes each lookup O(1).
negation_terms = set(notdict['term'])

# `amend_weight` starts as a copy of `weight` and receives the corrections.
# `id` is the row position in `data_posneg`, used to write corrections back.
data_posneg['amend_weight'] = data_posneg['weight']
data_posneg['id'] = np.arange(0, len(data_posneg))

# Rows that carry a sentiment weight (rows with any NaN are dropped).
only_inclination = data_posneg.dropna()
only_inclination.index = np.arange(0, len(only_inclination))

# Words of each review in row order, built once instead of re-filtering the
# whole table for every sentiment word.
words_by_review = {
    review_id: group.tolist()
    for review_id, group in data_posneg.groupby('index_content')['word']
}

amend_col = data_posneg.columns.get_loc('amend_weight')
weight_col = data_posneg.columns.get_loc('weight')

for row, review_id, position in zip(only_inclination['id'],
                                    only_inclination['index_content'],
                                    only_inclination['index_word']):
    review_words = words_by_review[review_id]
    affective = int(position)  # position of the sentiment word in its review
    # Up to two words immediately before the sentiment word; the slice is
    # empty when the sentiment word is the first word of the review.
    preceding = review_words[max(affective - 2, 0):affective]
    ne = sum(term in negation_terms for term in preceding)
    if ne == 1:
        # Positional write: avoids chained assignment, which may silently
        # modify a temporary copy instead of `data_posneg`.
        data_posneg.iloc[int(row), amend_col] = -data_posneg.iloc[int(row), weight_col]

# Re-derive the sentiment-only rows from `data_posneg` so that the corrected
# weights are used; the earlier `only_inclination` copy predates the loop.
only_inclination = data_posneg.dropna()
only_inclination.index = np.arange(0, len(only_inclination))

# Sentiment score of each review = sum of its corrected weights.
emotional_value = only_inclination.groupby(['index_content'], as_index=False)['amend_weight'].sum()

# Reviews whose score is exactly 0 are neither positive nor negative; drop them.
emotional_value = emotional_value[emotional_value['amend_weight'] != 0]


# 查看情感分析效果

# Label each scored review: 'pos' when its score is above 0, 'neg' when below.
# Work on an explicit copy and assign through .loc; the chained form
# `df['a_type'][mask] = ...` may write to a temporary and leave `a_type` empty.
emotional_value = emotional_value.copy()
emotional_value['a_type'] = ''
emotional_value.loc[emotional_value['amend_weight'] > 0, 'a_type'] = 'pos'
emotional_value.loc[emotional_value['amend_weight'] < 0, 'a_type'] = 'neg'

# Bring in the original label of every scored review (one row per review
# after de-duplication) to compare it with the predicted label.
result = emotional_value.merge(word,
                               left_on='index_content',
                               right_on='index_content',
                               how='left')

result = result[['index_content', 'content_type', 'a_type']].drop_duplicates()
# Cross-tabulate original label (rows) against predicted label (columns);
# margins=True appends the row/column totals.
confusion_matrix = pd.crosstab(result['content_type'], result['a_type'],
                               margins=True)
# Share of reviews on the diagonal out of the grand total.
# NOTE(review): assumes exactly two labels on each axis, in the same order —
# confirm against the data before relying on the fixed positions.
(confusion_matrix.iat[0, 0] + confusion_matrix.iat[1, 1]) / confusion_matrix.iat[2, 2]

# Split the token table into the rows of positively and negatively scored reviews.
ind_pos = list(emotional_value[emotional_value['a_type'] == 'pos']['index_content'])
ind_neg = list(emotional_value[emotional_value['a_type'] == 'neg']['index_content'])
# isin() selects the same rows as the per-row `i in ind_pos` list scan, without
# scanning the whole id list once for every token row.
posdata = word[word['index_content'].isin(ind_pos)]
negdata = word[word['index_content'].isin(ind_neg)]
# 繪製詞雲
# import matplotlib.pyplot as plt
# from wordcloud import WordCloud


def _show_cloud(cloud):
    """Display *cloud* with matplotlib and hide the axes."""
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()


# One WordCloud object is configured here and reused for both clouds.
wordcloud = WordCloud(
    font_path='/System/Library/Fonts/STHeiti Light.ttc',
    max_words=100,
    background_color='white',
)

# Words of positively scored reviews: occurrence counts, most frequent first.
freq_pos = (posdata.groupby(by=['word'])['word']
            .count()
            .sort_values(ascending=False))
pos_wordcloud = wordcloud.fit_words(freq_pos)
_show_cloud(pos_wordcloud)

# Words of negatively scored reviews, same treatment.
freq_neg = (negdata.groupby(by=['word'])['word']
            .count()
            .sort_values(ascending=False))
neg_wordcloud = wordcloud.fit_words(freq_neg)
_show_cloud(neg_wordcloud)

snowNLP 查看整體情感傾向
查看整體情緒頻數分布
from snownlp import SnowNLP
# SnowNLP sentiment score of every word in the noun subset `ind`.
sentimentslist = [SnowNLP(text).sentiments for text in ind.word]

import matplotlib.pyplot as plt
import numpy as np
# Font that contains CJK glyphs, so the Chinese axis labels render.
plt.rcParams['font.family'] = ['Arial Unicode MS']
plt.figure(figsize=(12, 8))

# 101 edges give 100 bins of width 0.01 covering the closed interval [0, 1].
# np.arange(0, 1, 0.01) stops at 0.99, so every score above 0.99 fell outside
# the last bin and was left out of the histogram.
plt.hist(sentimentslist, bins=np.linspace(0, 1, 101), facecolor='g')
plt.xlabel('情緒頻數')
plt.ylabel('數量')
plt.title('整體傾向')
plt.show()



查看整體情緒波動
result = []
# Centre the first 500 scores on zero (score - 0.5) and append them to `result`.
window = sentimentslist[:500]
result.extend(score - 0.5 for score in window)
i = len(window)  # same final value the manual counter ended on

# Line plot of the centred scores in their original order.
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 8))

plt.plot(result, 'k-')
plt.xlabel('數量')
plt.ylabel('情緒')
plt.title('整體傾向')
plt.show()



文本向量化（TfidfVectorizer, CountVectorizer）
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Bag-of-words counts limited to the `n_features` most frequent terms.
# NOTE(review): every element of `ind.word` is passed as one document, and the
# stop-word list is the English one — confirm both are intended for this
# Chinese corpus.
n_features = 1000
vectorizer_options = {
    'strip_accents': 'unicode',
    'max_features': n_features,
    'stop_words': 'english',
    'max_df': 0.5,
    'min_df': 10,
}
tf_vectorizer = CountVectorizer(**vectorizer_options)
tf = tf_vectorizer.fit_transform(ind.word)
tf_vectorizer
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=1000, min_df=10,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

LDA主題分析
from sklearn.decomposition import LatentDirichletAllocation

# Online-learning LDA with a fixed seed; the number of topics is left at the
# library default.
lda_options = dict(
    max_iter=50,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_options)

# Fit the topic model on the document-term count matrix.
lda.fit(tf)
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in *model*.

    For each row of ``model.components_`` a ``Topic #<idx>:`` header is
    printed, followed by one line with the ``n_top_words`` feature names of
    largest weight, in descending order and separated by spaces. A single
    blank line is printed after the last topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        top_ids = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[word_id] for word_id in top_ids]
        print(f"Topic #{topic_idx}:")
        print(" ".join(top_words))
    print()
# Print the 20 highest-weighted words of every fitted topic.
n_top_words = 20
# get_feature_names() was removed in scikit-learn 1.2; its replacement is
# get_feature_names_out(). The fallback keeps older releases working.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
Topic #0:
信賴 上門 服務態度 材料費 時間 管子 建議 差勁 封頂 謝謝 商品 客服 地方 產品 電器 購物 師傅 正品 小哥 評差
Topic #1:
速度 物流 品牌 熱水 遙控器 實惠 商品 客服 地方 產品 電器 購物 師傅 小哥 正品 評差 檢查 漏電 麻煩 父母
Topic #2:
服務 東西 遙控 專業 熱情 商城 寶貝 關鍵 信任 商品 客服 地方 產品 電器 購物 師傅 正品 小哥 評差 檢查
Topic #3:
師傅 客服 產品 購物 商品 電器 地方 小哥 正品 評差 檢查 漏電 麻煩 父母 消費者 理由 機子 塊錢 架子 螺絲
Topic #4:
電話 配件 燒水 評價 客戶 牌子 辦法 角閥 差評 老人 海爾 大氣 老家 花錢 花灑 水壓 消費 生氣 無線 農村
Topic #5:
質量 材料 垃圾 機器 公司 廠家 性價比 亂收費 回家 標準 玩意 五星 技術 太陽能 賣家 貨物 諮詢 客服 商品 地方
Topic #6:
價格 實體店 水管 坑人 朋友 降價 外觀 支架 保溫 網購 總體 活動 水閥 評論 家人 商家 圖片 細心 價格公道 商品
Topic #7:
售後 人員 免費 塊錢 情況 消費者 螺絲 理由 架子 評差 父母 機子 衛生間 功能 整體 麻煩 漏電 檢查 客服 商品
Topic #8:
態度 小時 感覺 下單 安裝工 自營 樣子 美觀 品質 閥門 太坑 容量 收貨 商品 客服 地方 產品 電器 購物 師傅
Topic #9:
安裝費 收費 費用 漏水 效果 發貨 體驗 管件 溫度 太貴 插座 過程 顧客 噁心 不合理 力給力 冷水 客服 商品 地方

pyLDAvis可視化
import pyLDAvis
import pyLDAvis
# Newer pyLDAvis releases ship the scikit-learn adapter as `pyLDAvis.lda_model`;
# older ones ship it as `pyLDAvis.sklearn`. Try the new name first.
try:
    import pyLDAvis.lda_model as sklearn_adapter
except ImportError:
    import pyLDAvis.sklearn as sklearn_adapter
pyLDAvis.enable_notebook()

# Build the visualisation data once and reuse it, instead of running
# prepare() a second time for the standalone viewer.
data = sklearn_adapter.prepare(lda, tf, tf_vectorizer)
# Bare expression: presumably rendered inline when it ends a notebook cell.
data
pyLDAvis.show(data)

pyLDAvis 圖是動態圖，會根據選擇顯示高頻詞彙。


建立詞典、語料庫（區分正面/負面評價）
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import re
import itertools
import matplotlib.pyplot as plt

# Token tables of the positively / negatively scored reviews.
posdata = pd.read_csv("/posdata.csv", encoding='utf-8')
negdata = pd.read_csv("/negdata.csv", encoding='utf-8')

from gensim import corpora, models

# Every word is wrapped in its own one-element list, i.e. each word is
# treated as a separate document.
pos_docs = [[token] for token in posdata['word']]
neg_docs = [[token] for token in negdata['word']]

# Word <-> id dictionaries, one per sentiment class.
pos_dict = corpora.Dictionary(pos_docs)  # positive
neg_dict = corpora.Dictionary(neg_docs)  # negative

# Bag-of-words vectors, one per document.
pos_corpus = [pos_dict.doc2bow(doc) for doc in pos_docs]  # positive
neg_corpus = [neg_dict.doc2bow(doc) for doc in neg_docs]  # negative

LDA主題分析（正面評價/負面評價）
# LDA主題分析
# Fit one LDA model per sentiment class, both with the same number of topics.
n_topics = 3
pos_lda = models.LdaModel(pos_corpus, num_topics=n_topics, id2word=pos_dict)
neg_lda = models.LdaModel(neg_corpus, num_topics=n_topics, id2word=neg_dict)

# Ten highest-weighted words of every positive-review topic.
pos_lda.print_topics(num_words=10)
[(0,
  '0.124*"安裝" + 0.041*"師傅" + 0.030*"送貨" + 0.025*"很快" + 0.018*"售後" + 0.014*"送" + 0.012*"家裡" + 0.010*"贊" + 0.010*"收" + 0.009*"快遞"'),
 (1,
  '0.028*"不錯" + 0.022*"客服" + 0.020*"東西" + 0.017*"物流" + 0.016*"購物" + 0.016*"太" + 0.014*"速度" + 0.014*"電話" + 0.014*"品牌" + 0.011*"小時"'),
 (2,
  '0.058*"滿意" + 0.029*"服務" + 0.025*"值得" + 0.023*"好評" + 0.019*"信賴" + 0.018*"人員" + 0.016*"差" + 0.013*"裝" + 0.012*"質量" + 0.012*"真的"')]
# Topics of the negative reviews: ten highest-weighted words per topic.
neg_lda.print_topics(num_words = 10)
[(0,
  '0.021*"東西" + 0.017*"收" + 0.015*"加熱" + 0.014*"漏水" + 0.012*"坑人" + 0.012*"真的" + 0.011*"坑" + 0.011*"服務" + 0.011*"問" + 0.009*"免費"'),
 (1,
  '0.134*"安裝" + 0.033*"垃圾" + 0.033*"師傅" + 0.027*"差" + 0.019*"裝" + 0.019*"小時" + 0.018*"不好" + 0.017*"貴" + 0.016*"慢" + 0.016*"燒水"'),
 (2,
  '0.032*"售後" + 0.032*"太" + 0.026*"安裝費" + 0.021*"客服" + 0.018*"收費" + 0.018*"打電話" + 0.014*"人員" + 0.012*"材料" + 0.012*"配件" + 0.011*"上門"')]
關注公眾號，在對話框回復「10193」獲取源碼 + 數據集。
商品評論情感化分析案例(LDA主題分析)

相關焦點

運用sklearn進行線性判別分析(LDA)代碼實現

數據挖掘之--LDA主題建模

案例分析:一文掌握商品庫存分析思路

在Netflix 評論中做情感分析的深度學習模型

中文情感分析之TextCNN

在 Netflix 評論中做情感分析的深度學習模型 | 洞見

《精神分析案例解析導論》

文本挖掘:避孕藥主題情感分析

文本情感分析:讓機器讀懂人類情感

人工智慧技術落地:情感分析概述

Python爬蟲與文本分析應用案例研討會

電商類APP系列2:商品卡片競品分析

手把手教你學會LDA話題模型可視化pyLDAvis庫

小白數據分析——Python職位數據分析全鏈路

人民日報新媒體新聞評論話語分析

線性判別分析(LDA)及其在R中實現

在運營中,為什麼文本分析遠比數值型分析重要?一個實際案例,五點...

社交網絡分析/京東surface5評論數據分析/詞雲圖/情感分析/網絡結構圖Python陳金文老師手把手教學

商品管理常用案例分析試題

☞【案例】大數據攻略案例分析及結論