关键词提取(TF-IDF & TextRank)

一、代码实现如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import jieba

import jieba.posseg

import numpy as np

import pandas as pd

import math

import operator



# Sample document used as the extraction target: a speech excerpt about
# China's urban rail-transit industry.  Adjacent string literals inside
# parentheses are concatenated implicitly (no backslash continuations).
text = ('广州地铁集团工会主席钟学军在开幕式上表示,在交通强国战略的指引下,我国城市轨道'
        '交通事业蓬勃发展,城轨线路运营里程不断增长,目前,全国城市轨道交通线网总里程'
        '接近5000公里,每天客运量超过5000万人次。城市轨道交通是高新技术密集型行业,'
        '几十个专业纷繁复杂,几十万台(套)设备必须安全可靠,线网调度必须联动周密,'
        '列车运行必须精准分秒不差。城市轨道交通又是人员密集型行业,产业工人素质的好坏、'
        '高低,直接与人民生命安全息息相关。本届“国赛”选取的列车司机和行车值班员,'
        '正是行业安全运营的核心、关键工种。开展职业技能大赛的目的,就是要弘扬'
        '“工匠精神”,在行业内形成“比、学、赶、帮、超”的良好氛围,在校园里掀起'
        '“学本领、争上游”的学习热潮,共同为我国城市轨道交通的高质量发展和交通强国'
        '建设目标的全面实现做出应有的贡献。')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def Stop_words():
    """Load the stop-word list from ./data/stopword.txt.

    Returns:
        list[str]: one stop word per line, with newline characters removed.
    """
    stopword = []
    # 'with' closes the handle even on error; the original opened the file
    # and never closed it.  The two-pass read/strip loops are merged into one.
    with open('./data/stopword.txt', encoding='utf8') as f:
        for line in f:
            stopword.append(line.replace('\n', ''))
    return stopword



# POS-tag the current document with jieba, then filter by POS and stop words.
def Filter_word(text):
    """Extract candidate keywords from *text*.

    Keeps only words whose jieba POS tag starts with 'n' (nouns), that are
    longer than one character, and that are not stop words.

    Args:
        text (str): raw document text.

    Returns:
        list[str]: candidate keywords in order of appearance (duplicates kept).
    """
    filter_word = []  # candidate keyword list
    # set() turns each stop-word membership test from O(n) to O(1).
    stopword = set(Stop_words())
    # POS-tag the whole document.
    for word, flag in jieba.posseg.cut(text):
        # 'not ...' replaces the original 'startswith(...) is False' antipattern.
        if not flag.startswith('n'):
            continue
        if word not in stopword and len(word) > 1:
            filter_word.append(word)
    return filter_word

# Load the corpus and filter every document by POS and stop words.
def Filter_words(data_path='./data/corpus.txt'):
    """Load the corpus and return each document's candidate keywords.

    Args:
        data_path (str): path to the corpus file, one document per line.

    Returns:
        list[list[str]]: per-document lists of filtered candidate keywords.
    """
    document = []
    # Hoisted out of the loop: the original re-read the stop-word file
    # once per corpus line.  A set makes each membership test O(1).
    stopword = set(Stop_words())
    # 'with' guarantees the corpus file is closed (original never closed it).
    with open(data_path, 'r', encoding='utf8') as f:
        for line in f:
            filter_words = [
                word
                for word, flag in jieba.posseg.cut(line.strip())
                # nouns only, longer than one char, not a stop word
                if flag.startswith('n') and word not in stopword and len(word) > 1
            ]
            document.append(filter_words)
    return document
1
2
3
# Notebook-style sanity cell: builds [2, 3, ..., 11].
a = list(range(2, 12))

a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def tf_idf():
    """Rank candidate keywords of the global ``text`` by TF-IDF; print top 10.

    TF is counted over the document's own candidate words; document
    frequencies come from the corpus loaded by Filter_words().  Prints the
    10 highest-scoring words separated by '/'.
    """
    # --- term frequency ---
    filter_word = Filter_word(text)
    tf_dict = {}  # candidate keyword -> TF value
    for word in filter_word:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    # NOTE(review): normalized by the character length of the raw text, not
    # the candidate count — kept as-is; a constant divisor never changes
    # the ranking.
    tf_dict = {word: freq / len(text) for word, freq in tf_dict.items()}

    # --- inverse document frequency ---
    idf_dict = {}  # candidate keyword -> inverse document frequency
    document = Filter_words()
    doc_total = len(document)
    for doc in document:
        # set(doc): count each document at most once per word
        # (this tallies the number of documents CONTAINING the word).
        for word in set(doc):
            idf_dict[word] = idf_dict.get(word, 0) + 1
    # +1 smoothing keeps the argument of log strictly positive.
    idf_dict = {word: math.log(doc_total / (df + 1))
                for word, df in idf_dict.items()}

    # --- TF-IDF ---
    tf_idf_dict = {}
    for word in filter_word:
        # Words that appear only in `text` and never in the corpus get an
        # IDF of 0 (same effect as the original's KeyError guard).
        tf_idf_dict[word] = tf_dict[word] * idf_dict.get(word, 0)

    # --- report the top 10 keywords ---
    keyword = 10
    print('TF-IDF模型结果:')
    # operator.itemgetter(1) builds a key function that extracts the score
    # (second field) from each (word, score) item.
    for key, value in sorted(tf_idf_dict.items(), key=operator.itemgetter(1),
                             reverse=True)[:keyword]:
        print(key + '/', end='')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def tf_idf1():
    """Alternative TF-IDF ranking of the global ``text``; print the top 10.

    Same result as tf_idf(), but document frequencies are computed per
    candidate word (scanning the corpus) rather than per corpus document.
    """
    # --- term frequency ---
    filter_word = Filter_word(text)
    tf_dict = {}
    for word in filter_word:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    # Constant divisor: ranking is unaffected by normalizing with len(text).
    tf_dict = {word: freq / len(text) for word, freq in tf_dict.items()}

    # --- inverse document frequency ---
    document = Filter_words()
    doc_total = len(document)
    # Precompute each document's word set once; the original rebuilt
    # set(doc) for every candidate word (O(words x docs x |doc|)).
    doc_sets = [set(doc) for doc in document]
    idf_dict = {}
    for word in tf_dict:
        # Number of corpus documents containing this word (0 if absent).
        idf_dict[word] = sum(1 for words in doc_sets if word in words)
    # +1 smoothing keeps the argument of log strictly positive.
    idf_dict = {word: math.log(doc_total / (df + 1))
                for word, df in idf_dict.items()}

    # --- TF-IDF ---
    # idf_dict was built from tf_dict's keys, so every lookup is safe.
    tf_idf_dict = {word: tf_dict[word] * idf_dict[word] for word in idf_dict}

    # --- report the top 10 keywords ---
    # (the original's debug prints of the dict sizes are removed)
    keyword = 10
    print('TF-IDF模型结果:')
    # operator.itemgetter(1) extracts the score from each (word, score) pair.
    for key, value in sorted(tf_idf_dict.items(), key=operator.itemgetter(1),
                             reverse=True)[:keyword]:
        print(key + '/', end=' ')
    print('\n')
1
2
3
4
5
# Run both TF-IDF variants on the sample text.
tf_idf()
# Fixed: the original printed the literal two characters "/n" (typo for
# the newline escape "\n").
print("\n")
tf_idf1()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def TextRank():
    """Rank candidate keywords of the global ``text`` with TextRank; print top 10.

    Builds an undirected co-occurrence graph over a sliding window of the
    candidate-word sequence, column-normalizes its adjacency matrix, and
    iterates the PageRank-style update R = (1-d) + d * M @ R.
    """
    window = 3
    filter_word = Filter_word(text)
    length = len(filter_word)

    # --- co-occurrence window sets -------------------------------------
    # word -> set of words that co-occur with it inside the window.
    # enumerate() fixes the original's filter_word.index(word), which
    # always returned the FIRST occurrence, so repeated words lost the
    # edges of their later occurrences; windows of all occurrences are
    # merged here.
    win_dict = {}
    for index, word in enumerate(filter_word):
        left = max(index - window + 1, 0)
        right = min(index + window, length)
        neighbours = win_dict.setdefault(word, set())
        for i in range(left, right):
            if i != index:
                neighbours.add(filter_word[i])

    # --- adjacency matrix of the undirected graph ----------------------
    word_dict = list(set(filter_word))
    lengths = len(word_dict)
    matrix = pd.DataFrame(np.zeros([lengths, lengths]))
    for word in win_dict:
        index1 = word_dict.index(word)
        for value in win_dict[word]:
            index2 = word_dict.index(value)
            # Undirected graph: keep the matrix symmetric.
            matrix.iloc[index1, index2] = 1
            matrix.iloc[index2, index1] = 1

    # --- column normalization ------------------------------------------
    # Bug fix: the original accumulated `summ` ACROSS columns (it was never
    # reset), so every column after the first was divided by a running
    # total.  Each column is now divided by its own sum.
    for j in range(matrix.shape[1]):
        col_sum = matrix[j].sum()
        if col_sum:  # guard: isolated nodes have an all-zero column
            matrix[j] /= col_sum

    # --- TextRank iteration --------------------------------------------
    d = 0.85          # damping factor
    iter_num = 700    # fixed number of power iterations
    textrank = np.ones([lengths, 1])
    for _ in range(iter_num):
        textrank = (1 - d) + d * np.dot(matrix, textrank)

    # Map each word to its converged score.
    word_textrank = {word_dict[i]: textrank[i, 0] for i in range(lengths)}

    # --- report the top 10 keywords ------------------------------------
    keyword = 10
    print('------------------------------')
    print('textrank模型结果:')
    for key, value in sorted(word_textrank.items(), key=operator.itemgetter(1),
                             reverse=True)[:keyword]:
        print(key + '/', end='')
1
# Run the TextRank extractor on the sample text.
TextRank()

二、尾巴