首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 开发语言 > 编程 >

使用余弦相似性原理计算文本的相似度

2013-08-11 
使用余弦相似性原理计算文本的相似度。原理参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html

使用余弦相似性原理计算文本的相似度
原理参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html

/** *  */package com.text;import java.io.IOException;import java.io.StringReader;import java.util.HashMap;import java.util.Map;import org.apache.commons.collections.MapUtils;import org.apache.commons.lang3.tuple.MutablePair;import org.apache.commons.lang3.tuple.Pair;import org.wltea.analyzer.core.IKSegmenter;import org.wltea.analyzer.core.Lexeme;/** * @author Riching *  * @date 2013-8-10 */public class IKMainTest {    /**     * @param args     * @throws IOException     */    public static void main(String[] args) throws IOException {        String str1 = "我喜欢看电视,不喜欢看电影。";        String str2 = "我不喜欢看电视,也不喜欢看电影。";        Map<String, Integer> tf1 = getTF(str1);        Map<String, Integer> tf2 = getTF(str2);        Map<String, MutablePair<Integer, Integer>> tfs = new HashMap<String, MutablePair<Integer, Integer>>();        for (String key : tf1.keySet()) {            MutablePair<Integer, Integer> pair = new MutablePair<Integer, Integer>(tf1.get(key), 0);            tfs.put(key, pair);        }        for (String key : tf2.keySet()) {            MutablePair<Integer, Integer> pair = tfs.get(key);            if (null == pair) {                pair = new MutablePair<Integer, Integer>(0, tf2.get(key));            } else {                pair.setRight(tf2.get(key));            }        }        double d = caclIDF(tfs);        System.out.println(d);    }    public static Map<String, Integer> getTF(String str) throws IOException {        Map<String, Integer> map = new HashMap<String, Integer>();        IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(str), true);        Lexeme lexeme = null;        while ((lexeme = ikSegmenter.next()) != null) {            String key = lexeme.getLexemeText();            Integer count = map.get(key);            if (null == count) {                count = 1;            } else {                count = count + 1;            }            map.put(key, count);        }        return map;    }    public static double caclIDF(Map<String, 
MutablePair<Integer, Integer>> tf) {        double d = 0;        if (MapUtils.isEmpty(tf)) {            return d;        }        double denominator = 0;        double sqdoc1 = 0;        double sqdoc2 = 0;        Pair<Integer, Integer> count = null;        for (String key : tf.keySet()) {            count = tf.get(key);            denominator += count.getLeft() * count.getRight();            sqdoc1 += count.getLeft() * count.getLeft();            sqdoc2 += count.getRight() * count.getRight();        }        d = denominator / (Math.sqrt(sqdoc1) * Math.sqrt(sqdoc2));        return d;    }}

热点排行