# Install and load the jiebaR Chinese word-segmentation package
install.packages("jiebaR")
library(jiebaR)

# Create a segmentation engine with default settings (type = "mix")
engine1 <- worker()
words <- "我希望未來會很好"

engine1[words]          # Method 1: subset the worker object with the text
engine1 <= words        # Method 2: jiebaR overloads `<=` to run segmentation
segment(words, engine1) # Method 3: call segment() directly (fixed stray `"` typo)
# Console output:
# [1] "我" "希望" "未來" "會" "很" "好"
# Segment a whole text file: passing a file path to segment() makes the
# worker write the result to a timestamped *.segment.*.txt file and
# return that output file's path.
library(jiebaR)
engine1 <- worker()
# Fixed: segment is a function, so it must be called with (), not
# subscripted with [] (segment['like.txt', engine1] is an error).
segment("like.txt", engine1)
# Console output (path of the generated segmentation file):
# [1] "C:/Users/Li/Documents/RData/like.segment.2020-08-12_22_09_29.txt"
# Full worker() signature with its default configuration options, shown
# for reference (dict/hmm/user/idf/stop_word paths, write-to-file flag,
# max query word length, number of keywords, etc.).
# Logical flags spelled out as TRUE/FALSE rather than T/F.
worker(type = "mix", dict = DICTPATH, hmm = HMMPATH, user = USERPATH,
       idf = IDFPATH, stop_word = STOPPATH, write = TRUE, qmax = 20, topn = 5,
       encoding = "UTF-8", detect = TRUE, symbol = FALSE, lines = 1e+05,
       output = NULL, bylines = FALSE, user_weight = "max")
# Configuration options can be set at creation time or changed later via `$`.
engine2 <- worker("hmm")   # set options when creating the worker
segment("./temp.txt", engine2)

engine2$write <- TRUE      # change an option through `$` (TRUE, not T)
segment("./temp.txt", engine2)

# Use a custom system dictionary and keep symbols/punctuation in the output
engine3 <- worker(type = "mix", dict = "dict_path", symbol = TRUE)
segment("./temp.txt", engine3)
# Keyword mode: extract the single highest-weighted keyword (topn = 1)
engine <- worker("keywords", topn = 1)
keywords(words, engine)
# Tagging mode: attach a part-of-speech tag to each segmented word
tagger <- worker("tag")
tagging("我愛廈門", tagger)
# Console output:
#    r     v     ns
# "我"  "愛" "廈門"
# Tags: r = pronoun, v = verb, ns = place name
### Simhash model: extract one keyword and compute simhash fingerprints
simhasher <- worker("simhash", topn = 1)
simhash(words, simhasher)  # simhash value of the keyword and of the text
# distance() uses the simhash engine to extract keywords from both inputs,
# then computes the Hamming distance between their simhash values.
distance("hello world", "hello world!", simhasher)
# Segment with a user-defined dictionary
engine <- worker(user = "mydict.utf8")
# Fixed: segment is a function, so it must be called with (), not
# subscripted with [].
segment("like.txt", engine)
# Filter stop words by part-of-speech tag: r = pronoun, p = preposition.
# Renamed from `stop` to avoid shadowing base R's stop() function.
stop_tags <- c("r", "p")
sentence <- worker(type = "tag")
# `<=` runs tagged segmentation; the result is a named character vector
# whose names are the part-of-speech tags.
sentence_words <- sentence <= "脫貧攻堅是我們黨對全國各族人民的莊嚴承諾,事關全面小康、家國夢想"
sentence_words[!(names(sentence_words) %in% stop_tags)] # drop pronouns and prepositions
# Console output (tag names above each retained word):
#      v     vn      v      n      n          l     uj      a      v
# "脫貧" "攻堅"   "是"   "黨" "全國" "各族人民"   "的" "莊嚴" "承諾"
#      n      n     nr      q      n      n
# "事關" "全面" "小康"   "家"   "國" "夢想"
# 註:停止詞詞典 txt 文件一定要放在 R 語言的工作目錄下才行,而且 txt 文件的
# 第一行需為空行,否則無法導入停用詞典;當前工作目錄路徑可用 getwd() 查看。
# (Note: the stop-word dictionary txt file must sit in R's working directory,
# and its first line must be blank or the dictionary fails to load; check the
# current working directory with getwd().)
# Filter stop words via a stop-word dictionary file instead of by tag
sentence <- worker(stop_word = "stop.txt")
sentence_words <- sentence <= "脫貧攻堅是我們黨對全國各族人民的莊嚴承諾,事關全面小康、家國夢想"
sentence_words
# Console output:
# [1] "脫貧" "攻堅" "黨" "全國" "各族人民" "莊嚴" "承諾" "事關" "全面"
# [10] "小康" "家" "國" "夢想"