关键词提取(TF-IDF & TextRank)

一、代码实现如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import jieba

import jieba.posseg

import numpy as np

import pandas as pd

import math

import operator



# Sample document used as the extraction target: a speech excerpt about
# China's urban rail-transit industry.  Adjacent string literals inside
# parentheses are concatenated implicitly (no backslash continuations).
text = ('广州地铁集团工会主席钟学军在开幕式上表示,在交通强国战略的指引下,我国城市轨道'
        '交通事业蓬勃发展,城轨线路运营里程不断增长,目前,全国城市轨道交通线网总里程'
        '接近5000公里,每天客运量超过5000万人次。城市轨道交通是高新技术密集型行业,'
        '几十个专业纷繁复杂,几十万台(套)设备必须安全可靠,线网调度必须联动周密,'
        '列车运行必须精准分秒不差。城市轨道交通又是人员密集型行业,产业工人素质的好坏、'
        '高低,直接与人民生命安全息息相关。本届“国赛”选取的列车司机和行车值班员,'
        '正是行业安全运营的核心、关键工种。开展职业技能大赛的目的,就是要弘扬'
        '“工匠精神”,在行业内形成“比、学、赶、帮、超”的良好氛围,在校园里掀起'
        '“学本领、争上游”的学习热潮,共同为我国城市轨道交通的高质量发展和交通强国'
        '建设目标的全面实现做出应有的贡献。')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def Stop_words():
    """Load the stop-word list from ./data/stopword.txt.

    Returns:
        list[str]: one stop word per line, with newline characters removed.
    """
    stopword = []
    # 'with' closes the handle even on error; the original opened the file
    # and never closed it.  The two-pass read/strip loops are merged into one.
    with open('./data/stopword.txt', encoding='utf8') as f:
        for line in f:
            stopword.append(line.replace('\n', ''))
    return stopword



# POS-tag the current document with jieba, then filter by POS and stop words.
def Filter_word(text):
    """Extract candidate keywords from *text*.

    Keeps only words whose jieba POS tag starts with 'n' (nouns), that are
    longer than one character, and that are not stop words.

    Args:
        text (str): raw document text.

    Returns:
        list[str]: candidate keywords in order of appearance (duplicates kept).
    """
    filter_word = []  # candidate keyword list
    # set() turns each stop-word membership test from O(n) to O(1).
    stopword = set(Stop_words())
    # POS-tag the whole document.
    for word, flag in jieba.posseg.cut(text):
        # 'not ...' replaces the original 'startswith(...) is False' antipattern.
        if not flag.startswith('n'):
            continue
        if word not in stopword and len(word) > 1:
            filter_word.append(word)
    return filter_word

# Load the corpus and filter every document by POS and stop words.
def Filter_words(data_path='./data/corpus.txt'):
    """Load the corpus and return each document's candidate keywords.

    Args:
        data_path (str): path to the corpus file, one document per line.

    Returns:
        list[list[str]]: per-document lists of filtered candidate keywords.
    """
    document = []
    # Hoisted out of the loop: the original re-read the stop-word file
    # once per corpus line.  A set makes each membership test O(1).
    stopword = set(Stop_words())
    # 'with' guarantees the corpus file is closed (original never closed it).
    with open(data_path, 'r', encoding='utf8') as f:
        for line in f:
            filter_words = [
                word
                for word, flag in jieba.posseg.cut(line.strip())
                # nouns only, longer than one char, not a stop word
                if flag.startswith('n') and word not in stopword and len(word) > 1
            ]
            document.append(filter_words)
    return document
1
2
3
# Notebook-style sanity cell: builds [2, 3, ..., 11].
a = list(range(2, 12))

a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def tf_idf():
    """Rank candidate keywords of the global ``text`` by TF-IDF; print top 10.

    TF is counted over the document's own candidate words; document
    frequencies come from the corpus loaded by Filter_words().  Prints the
    10 highest-scoring words separated by '/'.
    """
    # --- term frequency ---
    filter_word = Filter_word(text)
    tf_dict = {}  # candidate keyword -> TF value
    for word in filter_word:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    # NOTE(review): normalized by the character length of the raw text, not
    # the candidate count — kept as-is; a constant divisor never changes
    # the ranking.
    tf_dict = {word: freq / len(text) for word, freq in tf_dict.items()}

    # --- inverse document frequency ---
    idf_dict = {}  # candidate keyword -> inverse document frequency
    document = Filter_words()
    doc_total = len(document)
    for doc in document:
        # set(doc): count each document at most once per word
        # (this tallies the number of documents CONTAINING the word).
        for word in set(doc):
            idf_dict[word] = idf_dict.get(word, 0) + 1
    # +1 smoothing keeps the argument of log strictly positive.
    idf_dict = {word: math.log(doc_total / (df + 1))
                for word, df in idf_dict.items()}

    # --- TF-IDF ---
    tf_idf_dict = {}
    for word in filter_word:
        # Words that appear only in `text` and never in the corpus get an
        # IDF of 0 (same effect as the original's KeyError guard).
        tf_idf_dict[word] = tf_dict[word] * idf_dict.get(word, 0)

    # --- report the top 10 keywords ---
    keyword = 10
    print('TF-IDF模型结果:')
    # operator.itemgetter(1) builds a key function that extracts the score
    # (second field) from each (word, score) item.
    for key, value in sorted(tf_idf_dict.items(), key=operator.itemgetter(1),
                             reverse=True)[:keyword]:
        print(key + '/', end='')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def tf_idf1():
    """Alternative TF-IDF ranking of the global ``text``; print the top 10.

    Same result as tf_idf(), but document frequencies are computed per
    candidate word (scanning the corpus) rather than per corpus document.
    """
    # --- term frequency ---
    filter_word = Filter_word(text)
    tf_dict = {}
    for word in filter_word:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    # Constant divisor: ranking is unaffected by normalizing with len(text).
    tf_dict = {word: freq / len(text) for word, freq in tf_dict.items()}

    # --- inverse document frequency ---
    document = Filter_words()
    doc_total = len(document)
    # Precompute each document's word set once; the original rebuilt
    # set(doc) for every candidate word (O(words x docs x |doc|)).
    doc_sets = [set(doc) for doc in document]
    idf_dict = {}
    for word in tf_dict:
        # Number of corpus documents containing this word (0 if absent).
        idf_dict[word] = sum(1 for words in doc_sets if word in words)
    # +1 smoothing keeps the argument of log strictly positive.
    idf_dict = {word: math.log(doc_total / (df + 1))
                for word, df in idf_dict.items()}

    # --- TF-IDF ---
    # idf_dict was built from tf_dict's keys, so every lookup is safe.
    tf_idf_dict = {word: tf_dict[word] * idf_dict[word] for word in idf_dict}

    # --- report the top 10 keywords ---
    # (the original's debug prints of the dict sizes are removed)
    keyword = 10
    print('TF-IDF模型结果:')
    # operator.itemgetter(1) extracts the score from each (word, score) pair.
    for key, value in sorted(tf_idf_dict.items(), key=operator.itemgetter(1),
                             reverse=True)[:keyword]:
        print(key + '/', end=' ')
    print('\n')
1
2
3
4
5
# Run both TF-IDF variants on the sample text.
tf_idf()
# Fixed: the original printed the literal two characters "/n" (typo for
# the newline escape "\n").
print("\n")
tf_idf1()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def TextRank():
    """Rank candidate keywords of the global ``text`` with TextRank; print top 10.

    Builds an undirected co-occurrence graph over a sliding window of the
    candidate-word sequence, column-normalizes its adjacency matrix, and
    iterates the PageRank-style update R = (1-d) + d * M @ R.
    """
    window = 3
    filter_word = Filter_word(text)
    length = len(filter_word)

    # --- co-occurrence window sets -------------------------------------
    # word -> set of words that co-occur with it inside the window.
    # enumerate() fixes the original's filter_word.index(word), which
    # always returned the FIRST occurrence, so repeated words lost the
    # edges of their later occurrences; windows of all occurrences are
    # merged here.
    win_dict = {}
    for index, word in enumerate(filter_word):
        left = max(index - window + 1, 0)
        right = min(index + window, length)
        neighbours = win_dict.setdefault(word, set())
        for i in range(left, right):
            if i != index:
                neighbours.add(filter_word[i])

    # --- adjacency matrix of the undirected graph ----------------------
    word_dict = list(set(filter_word))
    lengths = len(word_dict)
    matrix = pd.DataFrame(np.zeros([lengths, lengths]))
    for word in win_dict:
        index1 = word_dict.index(word)
        for value in win_dict[word]:
            index2 = word_dict.index(value)
            # Undirected graph: keep the matrix symmetric.
            matrix.iloc[index1, index2] = 1
            matrix.iloc[index2, index1] = 1

    # --- column normalization ------------------------------------------
    # Bug fix: the original accumulated `summ` ACROSS columns (it was never
    # reset), so every column after the first was divided by a running
    # total.  Each column is now divided by its own sum.
    for j in range(matrix.shape[1]):
        col_sum = matrix[j].sum()
        if col_sum:  # guard: isolated nodes have an all-zero column
            matrix[j] /= col_sum

    # --- TextRank iteration --------------------------------------------
    d = 0.85          # damping factor
    iter_num = 700    # fixed number of power iterations
    textrank = np.ones([lengths, 1])
    for _ in range(iter_num):
        textrank = (1 - d) + d * np.dot(matrix, textrank)

    # Map each word to its converged score.
    word_textrank = {word_dict[i]: textrank[i, 0] for i in range(lengths)}

    # --- report the top 10 keywords ------------------------------------
    keyword = 10
    print('------------------------------')
    print('textrank模型结果:')
    for key, value in sorted(word_textrank.items(), key=operator.itemgetter(1),
                             reverse=True)[:keyword]:
        print(key + '/', end='')
1
# Run the TextRank extractor on the sample text.
TextRank()

二、尾巴