自动生成词,并通过tf-idf算法分析不同文本的不同

本程序在另外两位作者的源码基础上修改、整合而成,功能是:
1、自动从一大段文本中提取可能组成的词语。
2、输入多个文档,通过tf-idf算法自动提取各个文档的关键字。
3、通过提取关键字可以得到不同的QQ群讨论的主要内容。

关于第一条的原理说明请参考《互联网时代的社会语言学:基于SNS的文本数据挖掘》http://www.matrix67.com/blog/archives/5044

wordmaker.py

# -*- coding=utf-8 -*-
import re
import collections
import math
 
# Modified from https://gist.github.com/lastland/3322018
 
def info_entropy(words):
    """Return the Shannon entropy (natural log) of a count distribution.

    Args:
        words: mapping of item -> occurrence count (e.g. a Counter).

    Returns:
        The entropy ``-sum(p * ln(p))`` where ``p`` is each count divided
        by the total.  An empty mapping yields 0.
    """
    # Zero (or negative) counts carry no probability mass and would make
    # math.log() raise, so they are ignored.  .values() is used instead of
    # the Python-2-only .iteritems() so the function runs on 2 and 3.
    counts = [cnt for cnt in words.values() if cnt > 0]
    total = sum(counts)
    result = 0
    for cnt in counts:
        p = float(cnt) / total
        result -= p * math.log(p)
    return result
 
# Upper bound, in characters, on the length of the substrings that
# make_words() counts as candidate words.
max_word_len = 5
# make_words() rejects a candidate whose left or right neighbouring
# characters have an entropy (natural log, see info_entropy) below this.
entropy_threshold = 1
 
def make_words(content, filename, min_freq=3, min_cohesion=100):
    """Discover likely words in ``content`` and write them to ``filename``.

    The text is cut into "sentences" at every run of non-word characters
    and every run of ASCII letters/digits.  Every substring of up to
    ``max_word_len`` characters is counted, and a substring of two or more
    characters is kept as a word when:

    * it occurs at least ``min_freq`` times;
    * its probability is more than ``min_cohesion`` times the probability
      of its most likely two-part split (internal cohesion); and
    * on each side, either some occurrence touches a sentence boundary or
      the entropy of the neighbouring characters is at least
      ``entropy_threshold`` (the word is used in varied contexts).

    Args:
        content: text to analyse (expected to be a text/unicode string).
        filename: path of the UTF-8 file to write, one ``word count`` pair
            per line, most frequent word first.
        min_freq: minimum number of occurrences of a word.
        min_cohesion: minimum ratio between a word's probability and the
            probability of its best two-part split.

    Returns:
        A ``collections.Counter`` mapping each accepted word to its count.
    """
    # Function-scope import: this module's import header has no codecs.
    import codecs

    sentences = [
        piece
        for piece in re.split(r"\W+|[a-zA-Z0-9]+", content, flags=re.UNICODE)
        if piece
    ]

    # Step 1: count every substring of length 1..max_word_len.
    freq = collections.Counter()
    for sentence in sentences:
        length = len(sentence)
        for size in range(1, min(length, max_word_len) + 1):
            for start in range(length - size + 1):
                freq[sentence[start:start + size]] += 1

    total = float(sum(freq.values()))
    probability = {word: count / total for word, count in freq.items()}

    # Step 2: keep substrings that are frequent and internally cohesive.
    # Every prefix/suffix of a counted substring was counted as well, so
    # the lookups below cannot miss and best_split is always positive.
    candidates = set()
    for word, count in freq.items():
        if len(word) < 2 or count < min_freq:
            continue
        best_split = max(
            probability[word[:cut]] * probability[word[cut:]]
            for cut in range(1, len(word))
        )
        if probability[word] / best_split > min_cohesion:
            candidates.add(word)

    # Step 3: one pass over the text collecting, for EVERY occurrence of a
    # candidate, the character on each side.  Position checks (not
    # character comparisons) decide whether a neighbour exists, so a
    # neighbour equal to the word's own first/last character is still
    # counted correctly.
    left_neighbours = collections.defaultdict(collections.Counter)
    right_neighbours = collections.defaultdict(collections.Counter)
    starts_sentence = set()   # words seen with no left neighbour
    ends_sentence = set()     # words seen with no right neighbour
    for sentence in sentences:
        length = len(sentence)
        for size in range(2, min(length, max_word_len) + 1):
            for start in range(length - size + 1):
                end = start + size
                word = sentence[start:end]
                if word not in candidates:
                    continue
                if start > 0:
                    left_neighbours[word][sentence[start - 1]] += 1
                else:
                    starts_sentence.add(word)
                if end < length:
                    right_neighbours[word][sentence[end]] += 1
                else:
                    ends_sentence.add(word)

    # Step 4: drop candidates that always appear in the same context.
    final_words = set()
    for word in candidates:
        lefts = left_neighbours[word]
        rights = right_neighbours[word]
        if (word not in starts_sentence and lefts
                and info_entropy(lefts) < entropy_threshold):
            continue
        if (word not in ends_sentence and rights
                and info_entropy(rights) < entropy_threshold):
            continue
        final_words.add(word)

    # Most frequent first; ties broken by the word itself so the output
    # file is deterministic.
    ordered = sorted(final_words, key=lambda w: (-freq[w], w))

    final_freq = collections.Counter()
    # Explicit UTF-8 so the file can be read back with
    # codecs.open(..., "utf-8") regardless of the platform default.
    with codecs.open(filename, 'w', 'utf-8') as out:
        for word in ordered:
            count = freq[word]
            out.write("%s %d\n" % (word, count))
            final_freq[word] = count

    return final_freq

tfidf.py

#!/usr/bin/env python
# 
# Copyright 2009  Niniane Wang (niniane@gmail.com)
# Reviewed by Alex Mendes da Costa.
#
# This is a simple Tf-idf library.  The algorithm is described in
#   http://en.wikipedia.org/wiki/Tf-idf
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# Tfidf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details:
#
#   http://www.gnu.org/licenses/lgpl.txt
 
__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"
 
import math
import re
from operator import itemgetter
import collections
import codecs
 
# Modified from http://code.google.com/p/tfidf/
 
class TfIdf:

  """Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.

     The library constructs an IDF corpus and stopword list either from
     documents specified by the client, or by reading from input files.  It
     computes IDF for a specified term based on the corpus, or generates
     keywords ordered by tf-idf for a specified document.

     All files are read and written as UTF-8 text.
  """

  def __init__(self, corpus_filename = None, stopword_filename = None,
               DEFAULT_IDF = 1.5):
    """Initialize the idf dictionary.

       If a corpus file is supplied, reads the idf dictionary from it, in the
       format of:
         # of total documents
         term: # of documents containing the term
       (the form written by save_corpus_to_file; "term count" without the
       colon is accepted too).

       If a stopword file is specified, reads the stopword list from it, in
       the format of one stopword per line.

       The DEFAULT_IDF value is returned when a query term is not found in the
       idf corpus.
    """
    self.num_docs = 0
    self.term_num_docs = {}     # term : num_docs_containing_term
    self.stopwords = []
    self.idf_default = DEFAULT_IDF

    if corpus_filename:
      with codecs.open(corpus_filename, "r", "utf-8") as corpus_file:
        # The first line holds the number of documents.
        self.num_docs = int(corpus_file.readline().strip())

        # Every following line holds one term and its document count.
        for line in corpus_file:
          entry = self._parse_term_line(line)
          if entry is not None:
            term, frequency = entry
            self.term_num_docs[term] = frequency

    if stopword_filename:
      with codecs.open(stopword_filename, "r", "utf-8") as stopword_file:
        self.stopwords = [line.strip() for line in stopword_file]

  @staticmethod
  def _parse_term_line(line):
    """Parse a "term: count" or "term count" line into (term, count).

       The count is the last whitespace-separated field, so terms that
       contain spaces (e.g. the <a ...>...</a> tokens from get_tokens) are
       kept whole.  Returns None for a blank line; raises ValueError when
       the line has no count or the count is not an integer.
    """
    fields = line.rsplit(None, 1)
    if not fields:
      return None
    if len(fields) < 2:
      raise ValueError("malformed corpus line: %r" % (line,))
    # Drop the ":" separator that save_corpus_to_file puts after the term.
    return fields[0].rstrip(": \t"), int(fields[1])

  @staticmethod
  def _as_text(value):
    """Return value as text, decoding UTF-8 byte strings (Python 2 str)."""
    if isinstance(value, bytes):
      return value.decode("utf-8")
    return value

  def get_tokens(self, str):
    """Break a string into tokens, preserving URL tags as an entire token.

       This implementation does not preserve case.
       Clients may wish to override this behavior with their own tokenization.
    """
    return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())

  def add_input_document(self, input):
    """Add terms in the specified document to the idf dictionary."""
    self.num_docs += 1
    # Each distinct term counts once per document.
    for word in set(self.get_tokens(input)):
      self.term_num_docs[word] = self.term_num_docs.get(word, 0) + 1

  def add_input_corpus(self, corpus_filename):
    """Add a pre-tokenized document, stored as a word list file, to the idf
       dictionary.

       The file holds one "term count" pair per line (the format written by
       wordmaker.make_words).  The whole file counts as one document; the
       per-term counts are validated but do not affect the idf dictionary.
       Does nothing when corpus_filename is empty or None.
    """
    if not corpus_filename:
      return

    with codecs.open(corpus_filename, "r", "utf-8") as corpus_file:
      self.num_docs += 1

      terms = set()
      for line in corpus_file:
        entry = self._parse_term_line(line)
        if entry is not None:
          terms.add(entry[0])

    # A term contributes once to the document frequency even if the file
    # lists it on several lines.
    for term in terms:
      self.term_num_docs[term] = self.term_num_docs.get(term, 0) + 1

  def save_corpus_to_file(self, idf_filename, stopword_filename,
                          STOPWORD_PERCENTAGE_THRESHOLD = 0.01):
    """Save the idf dictionary and stopword list to the specified file.

       The idf file can be loaded again through the constructor.  Terms that
       occur in at least STOPWORD_PERCENTAGE_THRESHOLD of the documents are
       written to the stopword file, most common first.
    """
    with codecs.open(idf_filename, "w", "utf-8") as output_file:
      output_file.write(str(self.num_docs) + "\n")
      for term, num_docs in self.term_num_docs.items():
        output_file.write(self._as_text(term) + ": " + str(num_docs) + "\n")

    sorted_terms = sorted(self.term_num_docs.items(), key=itemgetter(1),
                          reverse=True)
    min_docs = STOPWORD_PERCENTAGE_THRESHOLD * self.num_docs
    with codecs.open(stopword_filename, "w", "utf-8") as stopword_file:
      for term, num_docs in sorted_terms:
        # Terms are sorted by decreasing count, so the first one below the
        # threshold ends the stopword list.
        if num_docs < min_docs:
          break
        stopword_file.write(self._as_text(term) + "\n")

  def get_num_docs(self):
    """Return the total number of documents in the IDF corpus."""
    return self.num_docs

  def get_idf(self, term):
    """Retrieve the IDF for the specified term.

       This is computed by taking the logarithm of (
       (1 + number of documents in corpus) divided by (1 + number of
        documents containing this term) ).

       Stopwords get an IDF of 0; terms missing from the corpus get the
       default IDF passed to the constructor.
    """
    if term in self.stopwords:
      return 0

    if term not in self.term_num_docs:
      return self.idf_default

    return math.log(float(1 + self.get_num_docs()) /
      (1 + self.term_num_docs[term]))

  def get_doc_keywords(self, curr_doc):
    """Retrieve terms and corresponding tf-idf for the specified document.

       The returned terms are ordered by decreasing tf-idf.

       The term frequency is the term's count divided by the number of
       DISTINCT terms in the document.
    """
    # Counter tallies all terms in one pass instead of one list.count()
    # scan per distinct term.
    counts = collections.Counter(self.get_tokens(curr_doc))
    num_distinct = len(counts)

    tfidf = {}
    for word, count in counts.items():
      mytf = float(count) / num_distinct
      myidf = self.get_idf(word)
      tfidf[word] = mytf * myidf

    return sorted(tfidf.items(), key=itemgetter(1), reverse=True)

  def get_corpus_keywords(self, corpus_filename):
    """Retrieve terms and corresponding tf-idf for a word list file.

       The file holds one "term count" pair per line (the format written by
       wordmaker.make_words).  The term frequency is the term's count
       divided by the number of term lines in the file.  The returned terms
       are ordered by decreasing tf-idf; an empty or None filename yields an
       empty list.
    """
    term_counts = collections.Counter()
    num_lines = 0
    if corpus_filename:
      with codecs.open(corpus_filename, "r", "utf-8") as corpus_file:
        for line in corpus_file:
          entry = self._parse_term_line(line)
          if entry is None:
            continue
          term, frequency = entry
          term_counts[term] = frequency
          num_lines += 1

    tfidf = {}
    for word, count in term_counts.items():
      mytf = float(count) / num_lines
      myidf = self.get_idf(word)
      tfidf[word] = mytf * myidf

    return sorted(tfidf.items(), key=itemgetter(1), reverse=True)

test_idf.py

# -*- coding=utf-8 -*-
import sys
import os.path

# Python 2 only: make implicit str/unicode conversions use UTF-8 instead of
# ASCII.  Python 3 has neither reload() as a builtin nor
# sys.setdefaultencoding(), and does not need the hack.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

import wordmaker
import tfidf
import re
import codecs

files = ['qq0','qq1', 'qq2', 'qq3', 'qq4']

# Lines that begin with a "digits-digits-digits" stamp are dropped before
# analysis (presumably the date headers of the exported chat log -- verify
# against the input files).
stamp_line = re.compile(r"\d+-\d+-\d+")

# Step 1: build one word list file ("name_<input>") per input file.
for filename in files:
    with codecs.open(filename, 'r', 'utf-8') as source:
        content = ''.join(line for line in source
                          if not stamp_line.match(line))
    wordmaker.make_words(content, 'name_' + filename)

# Step 2: register every word list as one document of the idf corpus.
my_tfidf = tfidf.TfIdf()
for n in files:
    my_tfidf.add_input_corpus('name_'+n)

# Step 3: rank the words of the second input by tf-idf.
keywords = my_tfidf.get_corpus_keywords('name_'+files[1])

# Print the 21 highest-ranked entries (the same number the original
# counter-based loop emitted).
for k, v in keywords[:21]:
    print("%s %s" % (k, v))

本文来自【余争】大神

《自动生成词,并通过tf-idf算法分析不同文本的不同》上有3条评论

发表评论

电子邮件地址不会被公开。