類似検索 - しげるメモ

勢いで類似検索と関連語検索などを作ってみた。Googleのようにキーワードを元に文書を探すのではなく、文書を元にほかの類似する文書を探してくる仕組み。関連語検索は、現在の文書に含まれないもののおそらく関連があると思われる単語を検出する。

詳しい説明は起きてから書くことにして、あまりに眠いのでプログラムと実行結果だけ。

全体で200行強くらい。3時間くらいでさくさくっと書いたので、怪しげなところはご指摘くださいませ。
使い方は、第1引数に検索クエリとして使うドキュメントの中身、第2引数以降に検索対象のドキュメントの中身。検索クエリに似たドキュメントを検索対象からtf*idfを使って探して、そのあとに関連語も探してくる。ちなみに、形態素解析してないので日本語だと無理。

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;


public class TfIdf {
    
    static final Pattern DELIM = Pattern.compile("[^A-Za-z]+");
    
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("usage: java TfIdf <query-document> <target-document>+");
            throw new IllegalArgumentException(Arrays.toString(args));
        }
        String query = args[0];
        System.out.println("===== クエリドキュメント =====");
        System.out.println(query);
        System.out.println();
        
        List<String> docs = Arrays.asList(args).subList(1, args.length);
        System.out.println("===== 検索対象ドキュメント =====");
        printDocuments(docs);
        System.out.println();
        
        // ドキュメント一覧の辞書
        Map<String, Integer> dictionary = buildDictionary(docs);
        System.out.println("===== 辞書 =====");
        printDictionary(dictionary);
        System.out.println();
        
        // 出現頻度の行列
        double[][] tfMatrix = new double[docs.size()][];
        for (int i = 0, n = docs.size(); i < n; i++) {
            // ドキュメントごとに単語の出現頻度ベクトルを計算
            tfMatrix[i] = tf(docs.get(i), dictionary);
        }
        System.out.println("===== 単語出現頻度行列 =====");
        printMatrix(tfMatrix);
        System.out.println();
        
        // tf * idf の行列
        double[][] tfIdfMatrix = tfIdfMatrix(tfMatrix, dictionary);
        System.out.println("===== tf*idf 行列 =====");
        printMatrix(tfIdfMatrix);
        System.out.println();
        
        // 問い合わせドキュメントの出現頻度ベクトル
        double[] queryTf = tf(query, dictionary);
        
        // queryTf * tfIdfMatrix を計算すると、ドキュメントごとの類似度が出せる
        double[] simDocVector = new double[docs.size()];
        for (int d = 0, dSize = docs.size(); d < dSize; d++) {
            for (int w = 0, wSize = dictionary.size(); w < wSize; w++) {
                simDocVector[d] += queryTf[w] * tfIdfMatrix[d][w];
            }
        }
        
        // 表示してみる
        System.out.println("===== ドキュメント類似度 =====");
        for (int i: top3(simDocVector)) {
            if (i >= 0) {
                System.out.printf("doc#%02d=%-6.3f: %s%n", i, simDocVector[i], docs.get(i));
            }
        }
        System.out.println();
        
        // さらに、 simDocVector * (tfIfdMatrixの転置行列) で関連語も探せる
        double[] assocWordVector = new double[dictionary.size()];
        for (int w = 0, wSize = dictionary.size(); w < wSize; w++) {
            for (int d = 0, dSize = docs.size(); d < dSize; d++) {
                // 2008.06.10 10:20修正 
                // assocWordVector[w] += simDocVector[d] * tfIdfMatrix[d][w];
                // クエリに出現しない単語のみ評価
                if (queryTf[w] == 0) {
                    assocWordVector[w] += simDocVector[d] * tfIdfMatrix[d][w];
                }
            }
        }
        
        // 表示してみる
        String[] words = toArray(dictionary);
        System.out.println("===== 関連語 =====");
        for (int i: top3(assocWordVector)) {
            if (i >= 0) {
                System.out.printf("%s: %.3f%n", words[i], assocWordVector[i]);
            }
        }
        System.out.println();
    }

    // ドキュメントに含まれる単語の一覧を作る
    static Map<String, Integer> buildDictionary(List<String> docs) {
        Map<String, Integer> dictionary = new HashMap<String, Integer>();
        for (String doc: docs) {
            Scanner s = new Scanner(doc);
            s.useDelimiter(DELIM);
            while (s.hasNext()) {
                String word = s.next().toLowerCase();
                // 初めて見た単語であれば、登録して0から順にIDを振る
                if (!dictionary.containsKey(word)) {
                    dictionary.put(word, dictionary.size());
                }
            }
            s.close();
        }
        return dictionary;
    }
    
    // 単語の出現頻度(term frequency)を計算する
    // 単語出現頻度 = (ドキュメント内に単語が出現した回数) / (ドキュメントの単語数)
    static double[] tf(String doc, Map<String, Integer> dictionary) {
        // 単語の出現頻度ベクトル
        // word の出現頻度は vTf[ dictionary[ word ] ] に格納される
        double[] vTf = new double[dictionary.size()];
        Scanner s = new Scanner(doc);
        s.useDelimiter(DELIM);
        int total = 0;
        
        // まず、それぞれの単語の出現回数をカウントする
        while (s.hasNext()) {
            String word = s.next().toLowerCase();
            // 辞書に含まれるもののみカウント
            if (dictionary.containsKey(word)) {
                vTf[ dictionary.get(word) ]++;
            }
            total++;
        }
        s.close();
        
        // それぞれの要素を、全体の単語数で割る
        for (int i = 0; i < vTf.length; i++) {
            vTf[i] /= total;
        }
        return vTf;
    }
    
    // 出現頻度の行列の各要素に、逆出現頻度をかけたものを返す
    // 逆出現頻度 = (総ドキュメント数) / (単語を含むドキュメント数) ;; logをかけることもある
    static double[][] tfIdfMatrix(double[][] tfMatrix, Map<String, Integer> dictionary) {
        // tf*idf の一覧 tfIdf[文書番号][単語番号] で参照
        double[][] tfIdf = new double[tfMatrix.length][dictionary.size()];
        
        // それぞれの単語について逆出現頻度を計算
        for (int w = 0, n = dictionary.size(); w < n; w++) {
            // 各文書のi番目の単語について調べる
            int appearanceCount = 0;
            for (double[] vTf: tfMatrix) {
                // term frequenceが0でなければ、その単語は出現している
                if (vTf[w] != 0) {
                    appearanceCount++;
                }
            }
            // 逆出現頻度なので、総ドキュメント数 / 出現回数 = idf
            double idf = (double) tfMatrix.length / appearanceCount;
            
            // tf * idf というだけあり、tf * idfすればいい
            for (int d = 0; d < tfMatrix.length; d++) {
                tfIdf[d][w] = tfMatrix[d][w] * idf;
            }
        }
        return tfIdf;
    }

    private static void printDocuments(List<String> docs) {
        for (int i = 0, n = docs.size(); i < n; i++) {
            System.out.printf("document #%02d: %s%n", i, docs.get(i));
        }
    }
    
    private static void printDictionary(Map<String, Integer> dictionary) {
        String[] array = toArray(dictionary);
        for (int i = 0; i < array.length; i++) {
            System.out.printf("word #%04d: %s%n", i, array[i]);
        }
    }

    private static void printMatrix(double[][] tfMatrix) {
        System.out.print("doc-no \\ ");
        for (int i = 0; i < tfMatrix[0].length; i++) {
            System.out.printf("w#%04d ", i);
        }
        System.out.println();
        
        for (int i = 0; i < tfMatrix.length; i++) {
            System.out.printf("doc#%02d | ", i);
            double[] tfVector = tfMatrix[i];
            for (int j = 0; j < tfVector.length; j++) {
                System.out.printf("%6.2f ", tfVector[j]);
            }
            System.out.println("|");
        }
    }

    private static String[] toArray(Map<String, Integer> dictionary) {
        String[] array = new String[dictionary.size()];
        for (Map.Entry<String, Integer> entry: dictionary.entrySet()) {
            array[entry.getValue()] = entry.getKey();
        }
        return array;
    }
    
    // 上位3つのインデックスを返す。実装はかなりイマイチ
    private static int[] top3(double[] scores) {
        Set<Integer> saw = new HashSet<Integer>();
        int[] result = new int[3];
        for (int i = 0; i < 3; i++) {
            double top = Double.NEGATIVE_INFINITY;
            int topAt = -1;
            for (int j = 0; j < scores.length; j++) {
                double s = scores[j];
                if (top < s && !saw.contains(j)) {
                    top = s;
                    topAt = j;
                }
            }
            if (topAt != -1) {
                result[i] = topAt;
                saw.add(topAt);
            }
            else {
                result[i] = -1;
                break;
            }
        }
        return result;
    }
}

Java言語仕様第3版から下記のセクションの冒頭部分を検索対象ドキュメントに。

4.1 The Kinds of Types and Values
4.2 Primitive Types and Values
4.3 Reference Types and Values
4.4 Type Variables
4.5 Parameterized Types
4.6 Type Erasure
4.7 Reifiable Types
Java SE Specifications

検索クエリは、8.1.2 Generic Classes and Type Parametersというジェネリクスに絡むセクションの冒頭を利用。

A class is generic if it declares one or more type variables (§4.4). These type variables are known as the type parameters of the class. The type parameter section follows the class name and is delimited by angle brackets. It defines one or more type variables that act as parameters. A generic class declaration defines a set of parameterized types, one for each possible invocation of the type parameter section. All of these parameterized types share the same class at runtime.
Java SE Specifications

ジェネリクスの用語がかなりクエリに含まれるので、検索対象ドキュメントのうちジェネリクスに関連するドキュメントが引っ張ってこれれば成功。

実行結果。

...(かなりの勢いで省略)

===== ドキュメント類似度 =====
doc#03=0.064 : 4.4 Type Variables: ...
doc#04=0.042 : 4.5 Parameterized Types: ...
doc#05=0.039 : 4.6 Type Erasure: ...

===== 関連語 =====
declarations: 0.064
erasure: 0.021
variable: 0.016

ということで成功。関連語がいまいちだけど、まぁそれなりにわからなくもない感じでしょうか。

実際にはドキュメント数も辞書サイズもひどいことになるので、doubleなんかで実装するのは非常に下策です。疎行列を使って計算したほうがいいと思います。また、スコアで枝刈りをしないと現実的な速度にならないので、そちらも注意が必要です。