import re

import jieba
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
stopword_path = "cn_stopwords.txt"
def load_stopwords(file_path):
    # Read the stopword list, one word per line.
    stop_words = []
    with open(file_path, encoding='UTF-8') as words:
        stop_words.extend([i.strip() for i in words.readlines()])
    return stop_words
def clearContentWithSpecialCharacter(content):
    # Drop {% ... %} template fragments: rewrite both delimiters to a sentinel
    # ("1"), then delete everything between two sentinels. Note the sentinel
    # would also match any literal "1" already present in the text.
    content = content.replace("{%", "1")
    content = content.replace("%}", "1")
    pattern = re.compile(r'(1)(.*)(1)')
    return pattern.sub(r'', content)
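# A quick check of the helper above (hypothetical input, not from the corpus):
assert clearContentWithSpecialCharacter("text {% widget %} more") == "text  more"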
def review_to_text(review):
    # Tokenize one comment: strip @mentions, keep only Chinese characters and
    # ASCII letters, segment with jieba, then drop stopwords. (Loading the
    # stopword list on every call is wasteful; it could be cached once.)
    stop_words = load_stopwords(stopword_path)
    review = re.sub(r"@[a-zA-Z0-9_\u4E00-\u9FA5]+", '', review)
    # The original class "[^\u4e00-\u9fa5^a-z^A-Z]" treated the inner "^" as a
    # literal, so stray "^" characters survived; the intent is to remove
    # everything that is not Chinese or a letter.
    review = re.sub(r"[^\u4e00-\u9fa5a-zA-Z]", '', review)
    review_cut = jieba.cut(review)
    all_stop_words = set(stop_words)
    # Always return the token list (the original returned None when the
    # stopword list was empty).
    return [w for w in review_cut if w not in all_stop_words]
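# Quick demo (hypothetical input, not from the corpus): the @mention and the
# "!" are stripped before segmentation; the exact token list depends on the
# jieba dictionary and the stopword file.
print(review_to_text("@某用户 这家店的服务真的很不错!"))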
def load_curpus(path):
    # Each line is "id,sentiment,comment"; split on the first two commas so
    # commas inside the comment text are preserved.
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            _, sentiment, comment = line.split(",", 2)
            data.append((comment, int(sentiment)))
    return data

def pro_curpus(path):
    # Same file format as load_curpus, but with each comment tokenized.
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            _, sentiment, comment = line.split(",", 2)
            data.append((review_to_text(comment), int(sentiment)))
    return data
train_data = load_curpus("train.txt")
test_data = load_curpus("test.txt")
train_data_pro = pro_curpus("train.txt")
test_data_pro = pro_curpus("test.txt")

train_df = pd.DataFrame(train_data_pro, columns=['content', 'sentiment'])
test_df = pd.DataFrame(test_data_pro, columns=['content', 'sentiment'])
# Fit the bag-of-words vocabulary over train + test tokens. max_df=0.8 drops
# words appearing in more than 80% of documents; min_df=5 drops words seen in
# fewer than 5 documents.
data_str = [' '.join(content) for content, sentiment in train_data_pro] + \
           [' '.join(content) for content, sentiment in test_data_pro]

vect = CountVectorizer(max_df=0.8, min_df=5)
nb = BernoulliNB()
vect.fit(data_str)  # the original called fit_transform and discarded the result
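# A small sanity check (not in the original): vect.vocabulary_ holds the
# fitted term-to-index mapping, so its size is the feature dimension.
print(len(vect.vocabulary_))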
# Vectorize the training set with the shared vocabulary.
X_data, y_data = [], []
for content, sentiment in train_data_pro:
    X_data.append(" ".join(content))
    y_data.append(sentiment)
X_train = vect.transform(X_data)
y_train = y_data

# Vectorize the test set the same way.
X_data2, y_data2 = [], []
for content, sentiment in test_data_pro:
    X_data2.append(" ".join(content))
    y_data2.append(sentiment)
X_test = vect.transform(X_data2)
y_test = y_data2
nb.fit(X_train, y_train)
train_score = nb.score(X_train, y_train)
print(train_score)
# Inspect training samples the model gets wrong. Predicting the whole matrix
# once replaces the original per-row predict calls, whose array result was
# compared against a scalar label.
bad_cases = []
train_pred = nb.predict(X_train)
for i in range(X_train.shape[0]):
    if train_pred[i] != y_train[i]:
        bad_cases.append(i)
        print("[%s],[%s],[true: %s],[predicted: %s]" % (train_data[i], X_data[i], y_train[i], train_pred[i]))
test_score = nb.score(X_test, y_test)
print(test_score)
result = nb.predict(X_test)
print(metrics.classification_report(y_test, result))
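# MultinomialNB is imported at the top but never fitted; a minimal sketch (an
# addition, not part of the original experiment) of comparing it against
# BernoulliNB on the same bag-of-words features:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
print(mnb.score(X_test, y_test))
print(metrics.classification_report(y_test, mnb.predict(X_test)))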
# Inspect misclassified test samples; the original indexed train_df here,
# but these indices refer to the test set.
for i in range(result.shape[0]):
    if result[i] != y_test[i]:
        print("[%s],[true: %s],[predicted: %s]" % (test_df['content'][i], y_test[i], result[i]))
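# Sketch of scoring a brand-new comment end to end (hypothetical input text;
# reuses review_to_text, vect, and nb from above):
new_comment = "物流很快, 包装也很好"
new_vec = vect.transform([" ".join(review_to_text(new_comment))])
print(nb.predict(new_vec))  # array holding the predicted sentiment label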