This program is modified from, and integrates, source code by two other authors. It does three things:
1. Automatically extracts candidate words from a long passage of raw text.
2. Given several documents, automatically extracts each document's keywords with the tf-idf algorithm.
3. Using those keywords, you can see the main topics discussed in different QQ groups.
For the theory behind the first point, see 《互联网时代的社会语言学:基于SNS的文本数据挖掘》 (Sociolinguistics in the Internet Age: Text Data Mining Based on SNS): https://www.matrix67.com/blog/archives/5044
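In short, wordmaker.py accepts a candidate string as a word when two conditions hold: its observed probability is much higher (here, more than 100x) than the best product of the probabilities of any two-way split (internal cohesion), and the characters seen next to it are varied enough (left/right neighbor entropy of at least 1 nat). A minimal sketch of the entropy criterion; the neighbor counts below are made up for illustration:

# -*- coding=utf-8 -*-
import math
import collections

def boundary_entropy(neighbors):
    # Shannon entropy (in nats) of the distribution of characters
    # observed next to a candidate word.
    total = float(sum(neighbors.values()))
    return -sum((n / total) * math.log(n / total)
                for n in neighbors.values())

# Hypothetical neighbor counts for two candidates.
varied = collections.Counter({u'的': 3, u'个': 2, u'是': 2, u'有': 1})
fixed = collections.Counter({u'葡': 8})  # always the same neighbor

print boundary_entropy(varied)  # about 1.32, above the threshold of 1
print boundary_entropy(fixed)   # 0.0 -- likely a fragment, not a word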
wordmaker.py
# -*- coding=utf-8 -*-
import re
import collections
import math
import codecs

# modified from https://gist.github.com/lastland/3322018

max_word_len = 5        # longest candidate word, in characters
entropy_threshold = 1   # minimum left/right neighbor entropy, in nats


def info_entropy(words):
    """Shannon entropy of a counter of neighboring characters."""
    result = 0
    total = sum([val for _, val in words.iteritems()])
    for word, cnt in words.iteritems():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result


def make_words(content, filename):
    """Extract likely words from `content`, write "word count" lines to
    `filename`, and return a Counter of the accepted words."""
    # Split on non-word characters and runs of ASCII alphanumerics,
    # keeping only stretches of CJK text.
    sentences = re.split(r"\W+|[a-zA-Z0-9]+", content, 0, re.UNICODE)

    # Count every substring of up to max_word_len characters.
    freq = collections.Counter()
    for sentence in sentences:
        if sentence:
            l = len(sentence)
            wl = min(l, max_word_len)
            for i in range(1, wl + 1):
                for j in range(0, l - i + 1):
                    freq[sentence[j:j + i]] += 1

    total = sum([val for _, val in freq.iteritems()])
    ps = collections.defaultdict(int)
    for word, val in freq.iteritems():
        ps[word] = float(val) / total

    # Internal cohesion: keep a candidate only if it occurs at least 3
    # times and its probability is more than 100x the best product of
    # the probabilities of any two-way split.
    words = set()
    for word, word_p in ps.items():
        if len(word) > 1:
            p = 0
            for i in range(1, len(word)):
                t = ps[word[0:i]] * ps[word[i:]]
                p = max(p, t)
            if freq[word] >= 3 and word_p / p > 100:
                words.add(word)

    # Boundary entropy: a free-standing word should be preceded and
    # followed by many different characters.
    final_words = set()
    for word in words:
        lf = rf = True
        left_words = collections.Counter()
        right_words = collections.Counter()
        # '.?word.?' captures one optional character of context on
        # each side of the candidate.
        pattern = re.compile('.?' + re.escape(word) + '.?')
        for sentence in sentences:
            l = pattern.findall(sentence)
            if l:
                if l[0][0] != word[0]:
                    left_words[l[0][0]] += 1
                else:
                    lf = False  # seen at least once with no left context
                if l[0][-1] != word[-1]:
                    right_words[l[0][-1]] += 1
                else:
                    rf = False  # seen at least once with no right context
        left_info_entropy = info_entropy(left_words)
        right_info_entropy = info_entropy(right_words)
        if lf and len(left_words) > 0 and left_info_entropy < entropy_threshold:
            continue
        if rf and len(right_words) > 0 and right_info_entropy < entropy_threshold:
            continue
        final_words.add(word)

    # Write the accepted words out, most frequent first.
    words_list = sorted(final_words, key=lambda w: freq[w], reverse=True)
    final_freq = collections.Counter()
    out = codecs.open(filename, 'w', 'utf-8')
    for word in words_list:
        v = freq[word]
        out.write("%s %d\n" % (word, v))
        final_freq[word] = v
    out.close()
    return final_freq
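A minimal usage sketch for wordmaker on its own; the filename qq0 matches the chat logs used in test_idf.py below, but any UTF-8 text file works:

# -*- coding=utf-8 -*-
import codecs
import wordmaker

# Read one UTF-8 chat log and extract words from it.
content = codecs.open('qq0', 'r', 'utf-8').read()
freq = wordmaker.make_words(content, 'name_qq0')

# make_words writes "word count" lines to name_qq0 and also returns a
# Counter of the accepted words.
for word, count in freq.most_common(10):
    print word, count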
tfidf.py
#!/usr/bin/env python
#
# Copyright 2009 Niniane Wang (niniane@gmail.com)
# Reviewed by Alex Mendes da Costa.
#
# This is a simple Tf-idf library. The algorithm is described in
# https://en.wikipedia.org/wiki/Tf-idf
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# Tfidf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details:
#
# https://www.gnu.org/licenses/lgpl.txt

__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"

import math
import re
from operator import itemgetter
import collections
import codecs

# modified from https://code.google.com/p/tfidf/


class TfIdf:
    """Tf-idf class implementing https://en.wikipedia.org/wiki/Tf-idf.

    The library constructs an IDF corpus and stopword list either from
    documents specified by the client, or by reading from input files.
    It computes IDF for a specified term based on the corpus, or
    generates keywords ordered by tf-idf for a specified document.
    """

    def __init__(self, corpus_filename=None, stopword_filename=None,
                 DEFAULT_IDF=1.5):
        """Initialize the idf dictionary.

        If a corpus file is supplied, reads the idf dictionary from it,
        in the format of:
          # of total documents
          term: # of documents containing the term

        If a stopword file is specified, reads the stopword list from
        it, in the format of one stopword per line.

        The DEFAULT_IDF value is returned when a query term is not
        found in the idf corpus.
        """
        self.num_docs = 0
        self.term_num_docs = {}  # term : num_docs_containing_term
        self.stopwords = []
        self.idf_default = DEFAULT_IDF

        if corpus_filename:
            corpus_file = open(corpus_filename, "r")
            # Load the number of documents.
            line = corpus_file.readline()
            self.num_docs = int(line.strip())

            # Read "term: frequency" from each subsequent line; split on
            # the colon, matching what save_corpus_to_file writes.
            for line in corpus_file:
                tokens = line.split(":")
                term = tokens[0].strip()
                frequency = int(tokens[1].strip())
                self.term_num_docs[term] = frequency

        if stopword_filename:
            stopword_file = open(stopword_filename, "r")
            self.stopwords = [line.strip() for line in stopword_file]

    def get_tokens(self, text):
        """Break a string into tokens, preserving URL tags as an entire
        token. This implementation does not preserve case; clients may
        wish to override it with their own tokenization."""
        return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower())

    def add_input_document(self, doc):
        """Add terms in the specified document to the idf dictionary."""
        self.num_docs += 1
        words = set(self.get_tokens(doc))
        for word in words:
            if word in self.term_num_docs:
                self.term_num_docs[word] += 1
            else:
                self.term_num_docs[word] = 1

    def add_input_corpus(self, corpus_filename):
        """Add a word-frequency file (one "word count" pair per line, as
        written by wordmaker.make_words) to the idf dictionary, treating
        the whole file as a single document."""
        if corpus_filename:
            corpus_file = codecs.open(corpus_filename, "r", "utf-8")
            # Each corpus file counts as one document.
            self.num_docs += 1
            for line in corpus_file:
                tokens = line.split(" ")
                term = tokens[0].strip()
                if term in self.term_num_docs:
                    self.term_num_docs[term] += 1
                else:
                    self.term_num_docs[term] = 1

    def save_corpus_to_file(self, idf_filename, stopword_filename,
                            STOPWORD_PERCENTAGE_THRESHOLD=0.01):
        """Save the idf dictionary and stopword list to the specified files."""
        output_file = open(idf_filename, "w")
        output_file.write(str(self.num_docs) + "\n")
        for term, num_docs in self.term_num_docs.items():
            output_file.write(term + ": " + str(num_docs) + "\n")

        # Terms that appear in at least STOPWORD_PERCENTAGE_THRESHOLD of
        # all documents are written out as stopwords.
        sorted_terms = sorted(self.term_num_docs.items(), key=itemgetter(1),
                              reverse=True)
        stopword_file = open(stopword_filename, "w")
        for term, num_docs in sorted_terms:
            if num_docs < STOPWORD_PERCENTAGE_THRESHOLD * self.num_docs:
                break
            stopword_file.write(term + "\n")

    def get_num_docs(self):
        """Return the total number of documents in the IDF corpus."""
        return self.num_docs

    def get_idf(self, term):
        """Retrieve the IDF for the specified term.

        This is computed by taking the logarithm of (number of documents
        in corpus) divided by (number of documents containing this term),
        with add-one smoothing on both counts.
        """
        if term in self.stopwords:
            return 0

        if term not in self.term_num_docs:
            return self.idf_default

        return math.log(float(1 + self.get_num_docs()) /
                        (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, curr_doc):
        """Retrieve terms and corresponding tf-idf for the specified
        document, ordered by decreasing tf-idf."""
        tfidf = {}
        tokens = self.get_tokens(curr_doc)
        tokens_set = set(tokens)
        for word in tokens_set:
            mytf = float(tokens.count(word)) / len(tokens_set)
            myidf = self.get_idf(word)
            tfidf[word] = mytf * myidf

        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)

    def get_corpus_keywords(self, corpus_filename):
        """Retrieve terms and tf-idf for a word-frequency file, ordered
        by decreasing tf-idf. Term frequencies are taken from the counts
        stored in the file."""
        tfidf = {}
        tokens_set = collections.Counter()
        tokens_len = 0
        if corpus_filename:
            corpus_file = codecs.open(corpus_filename, "r", "utf-8")
            for line in corpus_file:
                tokens = line.split(" ")
                term = tokens[0].strip()
                frequency = int(tokens[1].strip())
                tokens_set[term] = frequency
                tokens_len += 1

        for word, v in tokens_set.iteritems():
            mytf = float(v) / tokens_len
            myidf = self.get_idf(word)
            tfidf[word] = mytf * myidf

        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
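A minimal sketch of driving the class the way this project intends: add_input_corpus treats each word-frequency file produced by wordmaker as one document of the idf corpus, and get_corpus_keywords then ranks one file's words against that corpus (the name_qq* filenames match test_idf.py):

# -*- coding=utf-8 -*-
import tfidf

my_tfidf = tfidf.TfIdf()

# Each word list counts as one document when computing idf.
for name in ['name_qq0', 'name_qq1', 'name_qq2', 'name_qq3', 'name_qq4']:
    my_tfidf.add_input_corpus(name)

# Words frequent in this group but rare in the others score highest.
for word, score in my_tfidf.get_corpus_keywords('name_qq0')[:10]:
    print word, score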
test_idf.py
# -*- coding=utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import re
import codecs

import wordmaker
import tfidf

files = ['qq0', 'qq1', 'qq2', 'qq3', 'qq4']

# QQ chat-log exports prefix every message with a timestamp line; any
# line starting with a date is skipped so only message text is kept.
accepted_chars = re.compile(ur"\d+-\d+-\d+")

for filename in files:
    f = codecs.open(filename, 'r', 'utf-8')
    content = ''
    for line in f.readlines():
        if not accepted_chars.match(line):
            content += line
    f.close()
    # Extract words and write them, with counts, to name_<filename>.
    final_freq = wordmaker.make_words(content, 'name_' + filename)

# Build an idf corpus over all five word lists, then print the
# highest-scoring keywords of the second group.
my_tfidf = tfidf.TfIdf()
for n in files:
    my_tfidf.add_input_corpus('name_' + n)

keywords = my_tfidf.get_corpus_keywords('name_' + files[1])
cnt = 0
for k, v in keywords:
    print k, v
    cnt += 1
    if cnt > 20:
        break
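The scripts assume exported QQ chat logs in which every message is preceded by a line beginning with a date, so any line matching \d+-\d+-\d+ is discarded before word extraction. A small illustration with made-up log lines:

# -*- coding=utf-8 -*-
import re

accepted_chars = re.compile(ur"\d+-\d+-\d+")

# Hypothetical lines from a QQ export: a timestamp header followed by
# the actual message text.
lines = [u'2013-05-01 12:00:00 张三', u'今天大家讨论一下新版本的功能']
for line in lines:
    if not accepted_chars.match(line):
        print line  # only the message text survives the filter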
This article comes from the great 余争.