import pandas as pd
import numpy as np
import re
import nltk  # pip install nltk
# nltk.download('stopwords')  # run once to fetch the English stopword list
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!']
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
# Step 1: build a DataFrame from the corpus
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
# Step 2: define a function for tokenization and stopword removal
# load the English stopword list
stopwords = nltk.corpus.stopwords.words('english')
# build the tokenizer
cut_model = nltk.WordPunctTokenizer()

# tokenization and stopword-removal function
def Normalize_corpus(doc):
    # strip punctuation: keep only letters, digits, and whitespace
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)
    # convert the string to lowercase
    doc = doc.lower()
    # strip leading/trailing whitespace
    doc = doc.strip()
    # tokenize
    tokens = cut_model.tokenize(doc)
    # drop stopwords using the stopword list
    doc = [token for token in tokens if token not in stopwords]
    # re-join the remaining tokens with ' ' in preparation for the bag-of-words model
    doc = ' '.join(doc)
    return doc

# Step 3: vectorize the function and apply it
# np.vectorize makes the function accept a whole array and return an array,
# applying the scalar function element by element
Normalize_corpus = np.vectorize(Normalize_corpus)
# run tokenization and stopword removal over the corpus
corpus_norm = Normalize_corpus(corpus)
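As a quick sanity check, you can print the first couple of normalized documents. The expected values below assume the standard NLTK English stopword list (which includes 'the', 'is', 'and', 'this'); exact output may vary with the list's version:

# inspect the normalized corpus (sanity check, not part of the pipeline)
print(corpus_norm[0])  # expected: 'sky blue beautiful'
print(corpus_norm[1])  # expected: 'love blue beautiful sky'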
# Step 4: build the TF-IDF bag-of-words model with TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

Tf = TfidfVectorizer(use_idf=True)
Tf.fit(corpus_norm)
vocs = Tf.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
corpus_array = Tf.transform(corpus_norm).toarray()
corpus_norm_df = pd.DataFrame(corpus_array, columns=vocs)
print(corpus_norm_df.head())
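To see which terms carry the most weight in each document, you can sort each row of the TF-IDF matrix. A minimal inspection sketch (not part of the original pipeline; it reuses corpus_norm_df from above):

# print each document's top-3 TF-IDF terms with their weights
for i, row in corpus_norm_df.iterrows():
    top = row.sort_values(ascending=False).head(3)
    print(i, list(zip(top.index, top.round(3))))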
# Step 5: build the LDA topic model
from sklearn.decomposition import LatentDirichletAllocation

# n_components is the number of topics (named n_topics in very old scikit-learn)
LDA = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
# fit_transform returns each document's topic distribution, shape (n_docs, 2)
LDA_corpus = np.array(LDA.fit_transform(corpus_array))
# assign each document to whichever of the two topics has the higher probability
LDA_corpus_one = np.zeros([LDA_corpus.shape[0]])
LDA_corpus_one[LDA_corpus[:, 0] < LDA_corpus[:, 1]] = 1
corpus_norm_df['LDA_labels'] = LDA_corpus_one
print(corpus_norm_df.head())
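To interpret what the two topics mean, you can rank each topic's word weights from LDA.components_. A minimal sketch, reusing vocs from Step 4:

# LDA.components_ has shape (n_topics, n_words); a larger value means the
# word contributes more to that topic. Print the top-5 words per topic.
for topic_idx, weights in enumerate(LDA.components_):
    top_words = [vocs[i] for i in weights.argsort()[::-1][:5]]
    print('Topic', topic_idx, ':', top_words)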
