import pandas as pd
import numpy as np
import re
import nltk  # pip install nltk
# nltk.download('stopwords')  # run once to fetch the English stopword list
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!']
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
# Step 1: build a DataFrame from the corpus
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
# Step 2: define a function for tokenization and stopword removal
# load the English stopword list
stopwords = nltk.corpus.stopwords.words('english')
# build the tokenizer
cut_model = nltk.WordPunctTokenizer()

# tokenization and stopword-removal function
def Normalize_corpus(doc):
    # strip punctuation: keep only letters, digits, and whitespace
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)
    # convert the string to lowercase
    doc = doc.lower()
    # strip leading/trailing whitespace
    doc = doc.strip()
    # tokenize
    tokens = cut_model.tokenize(doc)
    # drop stopwords using the stopword list
    doc = [token for token in tokens if token not in stopwords]
    # re-join the remaining tokens with ' ' in preparation for the bag-of-words model
    doc = ' '.join(doc)
    return doc

# Step 3: vectorize the function and apply it
# np.vectorize makes the function accept a whole array and return an array,
# applying the scalar function element by element
Normalize_corpus = np.vectorize(Normalize_corpus)
# run tokenization and stopword removal over the corpus
corpus_norm = Normalize_corpus(corpus)
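As a quick sanity check, you can print the first couple of normalized documents. The expected values below assume the standard NLTK English stopword list (which includes 'the', 'is', 'and', 'this'); exact output may vary with the list's version:

# inspect the normalized corpus (sanity check, not part of the pipeline)
print(corpus_norm[0])  # expected: 'sky blue beautiful'
print(corpus_norm[1])  # expected: 'love blue beautiful sky'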
# Step 4: build the TF-IDF bag-of-words model with TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

Tf = TfidfVectorizer(use_idf=True)
Tf.fit(corpus_norm)
vocs = Tf.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
corpus_array = Tf.transform(corpus_norm).toarray()
corpus_norm_df = pd.DataFrame(corpus_array, columns=vocs)
print(corpus_norm_df.head())
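To see which terms carry the most weight in each document, you can sort each row of the TF-IDF matrix. A minimal inspection sketch (not part of the original pipeline; it reuses corpus_norm_df from above):

# print each document's top-3 TF-IDF terms with their weights
for i, row in corpus_norm_df.iterrows():
    top = row.sort_values(ascending=False).head(3)
    print(i, list(zip(top.index, top.round(3))))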
# Step 5: build the LDA topic model
from sklearn.decomposition import LatentDirichletAllocation

# n_components is the number of topics (named n_topics in very old scikit-learn)
LDA = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
# fit_transform returns each document's topic distribution, shape (n_docs, 2)
LDA_corpus = np.array(LDA.fit_transform(corpus_array))
# assign each document to whichever of the two topics has the higher probability
LDA_corpus_one = np.zeros([LDA_corpus.shape[0]])
LDA_corpus_one[LDA_corpus[:, 0] < LDA_corpus[:, 1]] = 1
corpus_norm_df['LDA_labels'] = LDA_corpus_one
print(corpus_norm_df.head())
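To interpret what the two topics mean, you can rank each topic's word weights from LDA.components_. A minimal sketch, reusing vocs from Step 4:

# LDA.components_ has shape (n_topics, n_words); a larger value means the
# word contributes more to that topic. Print the top-5 words per topic.
for topic_idx, weights in enumerate(LDA.components_):
    top_words = [vocs[i] for i in weights.argsort()[::-1][:5]]
    print('Topic', topic_idx, ':', top_words)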
