-
Notifications
You must be signed in to change notification settings - Fork 2
/
freq.py
96 lines (65 loc) · 2.61 KB
/
freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Implementation from https://dev.to/davidisrawi/build-a-quick-summarizer-with-python-and-nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
def _create_frequency_table(text_string) -> dict:
    """Count occurrences of each stemmed, non-stop-word token in the text.

    Args:
        text_string: Raw text to analyse.

    Returns:
        A dict mapping each Porter stem to the number of tokens in
        ``text_string`` that reduce to it. English stop words are left out.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    freq_table = {}
    for token in word_tokenize(text_string):
        # The stop-word list holds whole lowercase words, so the token itself
        # has to be checked before stemming: stems such as "thi" (from
        # "this") or "wa" (from "was") are not in the list and would
        # otherwise be counted as content words.
        if token.lower() in stop_words:
            continue
        stem = stemmer.stem(token)
        # Also keep the post-stemming check so that any token whose stem is
        # itself a stop word stays excluded, as it always was.
        if stem in stop_words:
            continue
        freq_table[stem] = freq_table.get(stem, 0) + 1
    return freq_table
def _score_sentences(sentences, freqTable) -> dict:
sentenceValue = dict()
for sentence in sentences:
word_count_in_sentence = (len(word_tokenize(sentence)))
word_count_in_sentence_except_stop_words = 0
for wordValue in freqTable:
if wordValue in sentence.lower():
word_count_in_sentence_except_stop_words += 1
if sentence[:10] in sentenceValue:
sentenceValue[sentence[:10]] += freqTable[wordValue]
else:
sentenceValue[sentence[:10]] = freqTable[wordValue]
if sentence[:10] in sentenceValue:
sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] / word_count_in_sentence_except_stop_words
return sentenceValue
def _find_average_score(sentenceValue) -> int:
sumValues = 0
for entry in sentenceValue:
sumValues += sentenceValue[entry]
# Average value of a sentence from original text
average = (int)(sumValues / len(sentenceValue))
return average
def _generate_summary(sentences, sentenceValue, threshold):
sentence_count = 0
summary = ''
for sentence in sentences:
if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] >= (threshold):
summary += " " + sentence
sentence_count += 1
return summary
def run_summarization(text, threshold_factor=1.3):
    """Produce an extractive summary of ``text`` from word frequencies.

    Args:
        text: The text to summarise.
        threshold_factor: Multiplier applied to the average sentence score to
            obtain the cut-off a sentence must reach to be kept. Higher
            values give shorter summaries. Defaults to 1.3.

    Returns:
        The selected sentences joined into one string, or an empty string
        when no sentence could be scored (for example for empty text).
    """
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)
    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)
    # 3 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)
    if not sentence_scores:
        # Nothing was scored, so there is no average to compute and no
        # sentence could be selected anyway.
        return ''
    # 4 Find the threshold
    threshold = _find_average_score(sentence_scores)
    # 5 Important Algorithm: Generate the summary
    summary = _generate_summary(
        sentences, sentence_scores, threshold_factor * threshold
    )
    return summary
def main_freq(path='transcript.txt'):
    """Read a text file and return its frequency-based summary.

    Args:
        path: Location of the text file to summarise. Defaults to
            'transcript.txt', resolved against the current working directory.

    Returns:
        The summary string produced by ``run_summarization``.
    """
    with open(path) as f:
        text_str = f.read()
    # The file is closed before the summarization work starts.
    return run_summarization(text_str)