TF-IDF[2]
package com.sap.research.semantic;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.print.attribute.standard.OutputDeviceAssigned;

import cc.mallet.types.Vector;
// NOTE: this import binds the simple name "Set" to a mallet type, so
// java.util.Set cannot be referenced by its simple name in this file.
import cc.mallet.util.CommandOption.Set;

import com.sap.research.util.Pair;

/**
 * Maps documents onto concept-weight vectors using an {@link InvertedIndex}
 * and compares two such vectors with cosine similarity.
 */
public class SemanticInterpretor {

    /** Index that supplies per-document term counts and term-to-concept weights. */
    private final InvertedIndex invertedIndex;

    /**
     * Creates an interpreter backed by the given index.
     *
     * @param invertedIndex index queried for term counts and concept weights
     */
    public SemanticInterpretor(InvertedIndex invertedIndex) {
        this.invertedIndex = invertedIndex;
    }

    /**
     * Builds the concept vector of a document.
     *
     * <p>For every term returned by {@code invertedIndex.termCount(file)} that
     * also appears in {@code invertedIndex.getInvertedIndicesDouble()}, each
     * associated (concept, weight) pair contributes
     * {@code termFrequency * weight} to that concept's entry. Terms that are
     * not present in the index are ignored. The resulting vector is also
     * printed to standard output.
     *
     * @param file document to interpret
     * @return map from concept to accumulated relatedness; empty when no term
     *         of the document is known to the index
     */
    public HashMap<String, Double> vectorRepreforDoc(File file) {
        // term -> pair whose second component is read as the term frequency
        HashMap<String, Pair<String, Integer>> termCounts = invertedIndex.termCount(file);
        // term -> list of (concept, weight) pairs
        HashMap<String, ArrayList<Pair<String, Double>>> indices =
                invertedIndex.getInvertedIndicesDouble();

        // concept -> accumulated relatedness
        HashMap<String, Double> semanticVec = new HashMap<String, Double>();

        for (Map.Entry<String, Pair<String, Integer>> entry : termCounts.entrySet()) {
            ArrayList<Pair<String, Double>> conceptWeights = indices.get(entry.getKey());
            if (conceptWeights == null) {
                // The term is unknown to the index: it contributes nothing.
                continue;
            }

            Integer frequency = entry.getValue().second;
            for (Pair<String, Double> conceptWeight : conceptWeights) {
                double relatedness = frequency * conceptWeight.second;
                Double accumulated = semanticVec.get(conceptWeight.first);
                semanticVec.put(conceptWeight.first,
                        accumulated == null ? relatedness : accumulated + relatedness);
            }
        }

        output(semanticVec);
        return semanticVec;
    }

    /**
     * Prints a vector to standard output.
     *
     * @param semanticVec vector to print
     */
    private void output(HashMap<String, Double> semanticVec) {
        System.out.println(semanticVec);
    }

    /**
     * Returns the weight stored for {@code key}, or {@code 0.0} when the
     * vector has no (non-null) value for it.
     */
    private static double weightOf(Map<String, Double> vector, String key) {
        Double weight = vector.get(key);
        return weight == null ? 0.0 : weight;
    }

    /**
     * Computes the cosine similarity of two concept vectors.
     *
     * <p>Both vectors are first expanded over the union of their keys (a
     * missing key counts as {@code 0.0}); the two expanded vectors are printed
     * to standard output. The result is
     * {@code dot(v1, v2) / (|v1| * |v2|)}.
     *
     * @param docVec1 first concept vector
     * @param docVec2 second concept vector
     * @return the cosine similarity, or {@code 0.0} when either vector has
     *         zero length (empty or all-zero), where the quotient would
     *         otherwise be {@code NaN}
     */
    public double semanticalSimilarity(HashMap<String, Double> docVec1,
            HashMap<String, Double> docVec2) {
        // Union of keys: all of docVec1 first, then the keys only docVec2 has.
        List<String> base = new ArrayList<String>(docVec1.keySet());
        for (String key : docVec2.keySet()) {
            if (!docVec1.containsKey(key)) {
                base.add(key);
            }
        }

        HashMap<String, Double> docVec1Std = new HashMap<String, Double>();
        HashMap<String, Double> docVec2Std = new HashMap<String, Double>();
        double prod = 0.0;
        double doc1Sqr = 0.0;
        double doc2Sqr = 0.0;

        for (String key : base) {
            double weight1 = weightOf(docVec1, key);
            double weight2 = weightOf(docVec2, key);
            docVec1Std.put(key, weight1);
            docVec2Std.put(key, weight2);

            prod += weight1 * weight2;
            doc1Sqr += weight1 * weight1;
            doc2Sqr += weight2 * weight2;
        }

        output(docVec1Std);
        output(docVec2Std);

        double normProduct = Math.sqrt(doc1Sqr) * Math.sqrt(doc2Sqr);
        if (normProduct == 0.0) {
            // At least one vector has no magnitude; avoid 0.0 / 0.0 == NaN.
            return 0.0;
        }
        return prod / normProduct;
    }

    /**
     * Computes the semantic relatedness of two documents as the cosine
     * similarity of their concept vectors.
     *
     * <p>The index is used as-is; this method does not add anything to it.
     *
     * @param fileA first document
     * @param fileB second document
     * @return similarity as defined by
     *         {@link #semanticalSimilarity(HashMap, HashMap)}
     */
    public double computeSemanticRelatedness(File fileA, File fileB) {
        return semanticalSimilarity(vectorRepreforDoc(fileA), vectorRepreforDoc(fileB));
    }

    /**
     * Demo entry point: builds an index from a sample directory and prints the
     * relatedness of two sample documents.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String propertyFilePath = "settings/html.properties";
        InvertedIndex invertedIndex = new InvertedIndex(propertyFilePath);
        invertedIndex.addIndices("test/input/html_20", "html", true);

        SemanticInterpretor interpretor = new SemanticInterpretor(invertedIndex);
        File fileA = new File("test/input/html/topic1.html");
        File fileB = new File("test/input/html/topic2.html");

        double similarity = interpretor.computeSemanticRelatedness(fileA, fileB);
        System.out.println("Semantic relatedness of " + fileA.getName()
                + " and " + fileB.getName() + " is: " + similarity);
    }
}