TermsResultSet.java
- /*
- * TermsResultSet.java
- *
- * Copyright (c) 2007-2011, The University of Sheffield.
- *
- * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
- * and is free software, licenced under the GNU Lesser General Public License,
- * Version 3, June 2007 (also included with this distribution as file
- * LICENCE-LGPL3.html).
- *
- * Valentin Tablan, 13 Jul 2012
- *
- * $Id: TermsResultSet.java 19444 2016-06-28 16:38:18Z ian_roberts $
- */
- package gate.mimir.search.terms;
- import gate.mimir.SemanticAnnotationHelper;
- import it.unimi.dsi.fastutil.Arrays;
- import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
- import it.unimi.dsi.fastutil.ints.IntArrayList;
- import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
- import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
- import it.unimi.dsi.fastutil.objects.ObjectArrayList;
- import it.unimi.dsi.fastutil.objects.ObjectIterator;
- import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
- import java.io.Serializable;
- import java.util.ArrayList;
- import java.util.Collections;
- import java.util.HashSet;
- import java.util.Iterator;
- import java.util.List;
- import java.util.Set;
- /**
- * Class representing the results of a {@link TermsQuery}.
- * A terms result set is a set of terms, represented by their
- * {@link #termStrings}. Optionally {@link #termCounts},
- * {@link #termDescriptions}, and {@link #termLengths} may also be available.
- */
- public class TermsResultSet implements Serializable {
-
- /**
- * Serialization ID.
- */
- private static final long serialVersionUID = -7722325563637139625L;
-
- /**
- * The lengths (number of tokens) for the terms. Array parallel with
- * {@link #termStrings} and {@link #termDescriptions}. The methods of this
- * class treat a <code>null</code> value as "lengths not available".
- */
- public final int[] termLengths;
-
- /**
- * The strings for the terms. Array parallel with
- * {@link #termCounts} and {@link #termDescriptions}.
- */
- public final String[] termStrings;
- /**
- * This field is populated by the
- * {@link #groupByDescription(TermsResultSet...)} method (and carried over by
- * {@link #filterByDescriptionNot(TermsResultSet, String...)}); the
- * constructor leaves it <code>null</code>. It contains term
- * strings from the original result sets indexed by position in this result
- * set, and the index of the results set. For example
- * originalTermStrings[i][j] is a String[], containing all the term strings
- * associated with termDescriptions[i] in the j<sup>th</sup> result set.
- */
- public String[][][] originalTermStrings;
-
- /**
- * For annotation indexes, the term string is simply a URI in whatever format
- * is used by the {@link SemanticAnnotationHelper} that was used to index the
- * annotations. These URIs are not useful outside of the annotation helper
- * and index, so term descriptions can be requested. If term descriptions were
- * produced during the search, they are stored in this array (which is aligned
- * with {@link #termStrings} and {@link #termCounts}). The methods of this
- * class treat a <code>null</code> value as "descriptions not available".
- */
- public final String[] termDescriptions;
-
- /**
- * The counts (numbers of occurrences) for the terms. Array parallel with
- * {@link #termStrings} and {@link #termDescriptions}. The methods of this
- * class treat a <code>null</code> value as "counts not available".
- */
- public final int[] termCounts;
-
- /**
- * Creates a result set that wraps the supplied arrays. The arrays are stored
- * as given (no copies are made), so they are shared with the caller.
- *
- * @param termStrings the term strings.
- * @param termLengths the term lengths, parallel with termStrings.
- * @param termCounts the term counts, parallel with termStrings.
- * @param termDescriptions the term descriptions, parallel with termStrings.
- */
- public TermsResultSet(String[] termStrings, int[] termLengths,
-     int[] termCounts, String[] termDescriptions) {
-   // the implicit superclass constructor call is all that is needed here
-   this.termDescriptions = termDescriptions;
-   this.termCounts = termCounts;
-   this.termLengths = termLengths;
-   this.termStrings = termStrings;
- }
-
- /**
- * Shared constant for a result set without any terms: all four parallel
- * arrays are present, each of length zero.
- */
- public static final TermsResultSet EMPTY =
-     new TermsResultSet(new String[0], new int[0], new int[0], new String[0]);
-
-
- /**
- * Looks up the original term strings recorded for one entry of this result
- * set, i.e. the term strings that shared the description found at the given
- * position of {@link #termDescriptions}, listed separately for each of the
- * result sets that were combined to produce this one.
- *
- * @param termPosition the position of the term whose original term strings
- * are requested.
- * @return an array in which the element at position i holds the term strings
- * contributed by the i<sup>th</sup> combined result set (sub-index) for the
- * requested term, or <code>null</code> when {@link #originalTermStrings} has
- * not been populated.
- */
- public String[][] getSubIndexTerms(int termPosition) {
-   if(originalTermStrings == null) {
-     // no original term strings were recorded for this result set
-     return null;
-   }
-   return originalTermStrings[termPosition];
- }
-
- /**
- * Tries to interpret the supplied term string as a position in
- * {@link #termStrings} and, if the term stored at that position is the very
- * same string, delegates to {@link #getSubIndexTerms(int)}.
- *
- * In every other case (the string is <code>null</code>, is not a decimal
- * integer, denotes a position outside {@link #termStrings}, or does not
- * match the term stored at that position) the term string cannot be
- * converted and is returned unchanged, wrapped in a 1&times;1 array.
- *
- * @param termString the term string to look up.
- * @return the result of {@link #getSubIndexTerms(int)} for the located
- * position (which is <code>null</code> if no original term strings are
- * available), or <code>{{termString}}</code> if no position could be located.
- */
- public String[][] getSubIndexTerms(String termString) {
-   int termPos = -1;
-   try {
-     termPos = Integer.parseInt(termString);
-   } catch(NumberFormatException ignored) {
-     // not a number (or null): termPos stays -1 and we fall through to the
-     // "leave it unchanged" branch below
-   }
-   // the bounds check prevents indexing with -1 (parse failure) or with a
-   // number that is not a valid position in this result set
-   if(termPos >= 0 && termPos < termStrings.length
-       && termString.equals(termStrings[termPos])) {
-     return getSubIndexTerms(termPos);
-   }
-   // could not convert it: leave it unchanged
-   return new String[][]{{termString}};
- }
-
- /**
- * Sorts the arrays inside a {@link TermsResultSet} using the termString for
- * comparison.
- * @param trs
- */
- public static void sortTermsResultSetByTermString(final TermsResultSet trs) {
- Arrays.quickSort(0, trs.termStrings.length, new AbstractIntComparator() {
- @Override
- public int compare(int k1, int k2) {
- return trs.termStrings[k1].compareTo(trs.termStrings[k2]);
- }
- }, new Swapper(trs));
- }
- /**
- * Produces a copy of a result set from which all entries whose description
- * appears in a list of banned descriptions have been removed. The counts,
- * lengths and original term strings of the surviving entries are copied
- * whenever the input provides them.
- *
- * @param setToFilter the terms result set to filter.
- * @param bannedDescriptions the term descriptions that must not appear in
- * the output.
- * @return the filtered result set; if the input has no term descriptions,
- * the input itself is returned.
- */
- public static TermsResultSet filterByDescriptionNot(TermsResultSet setToFilter, String... bannedDescriptions) {
-   final String[] allDescriptions = setToFilter.termDescriptions;
-   if(allDescriptions == null) {
-     // without descriptions there is nothing to match against the ban list
-     return setToFilter;
-   }
-   final int[] allCounts = setToFilter.termCounts;
-   final int[] allLengths = setToFilter.termLengths;
-   final String[][][] allOrigTerms = setToFilter.originalTermStrings;
-
-   // accumulators for the entries that survive; the optional ones only
-   // exist if the corresponding input array does
-   ObjectArrayList<String> keptStrings = new ObjectArrayList<String>();
-   ObjectArrayList<String> keptDescriptions = new ObjectArrayList<String>();
-   ObjectArrayList<String[][]> keptOrigTerms = new ObjectArrayList<String[][]>();
-   IntArrayList keptLengths = (allLengths == null) ? null : new IntArrayList();
-   IntArrayList keptCounts = (allCounts == null) ? null : new IntArrayList();
-   ObjectOpenHashSet<String> banned = new ObjectOpenHashSet<String>(bannedDescriptions);
-
-   for(int pos = 0; pos < allDescriptions.length; pos++) {
-     String description = allDescriptions[pos];
-     if(banned.contains(description)) {
-       continue;
-     }
-     keptDescriptions.add(description);
-     keptStrings.add(setToFilter.termStrings[pos]);
-     if(keptCounts != null) {
-       keptCounts.add(allCounts[pos]);
-     }
-     if(keptLengths != null) {
-       keptLengths.add(allLengths[pos]);
-     }
-     if(allOrigTerms != null) {
-       keptOrigTerms.add(allOrigTerms[pos]);
-     }
-   }
-
-   final int kept = keptDescriptions.size();
-   String[] outStrings = keptStrings.toArray(new String[kept]);
-   int[] outLengths = (keptLengths == null) ? null : keptLengths.toArray(new int[kept]);
-   int[] outCounts = (keptCounts == null) ? null : keptCounts.toArray(new int[kept]);
-   String[] outDescriptions = keptDescriptions.toArray(new String[kept]);
-   TermsResultSet filtered =
-       new TermsResultSet(outStrings, outLengths, outCounts, outDescriptions);
-   if(allOrigTerms != null) {
-     filtered.originalTermStrings = keptOrigTerms.toArray(new String[kept][][]);
-   }
-   return filtered;
- }
-
- /**
- * Merges one or more {@link TermsResultSet} values into a single result set
- * in which every term description occurs only once.
- *
- * A result set may contain the same description for several distinct term
- * strings, and result sets coming from the different sub-indexes of a
- * federated index may share descriptions as well. This method creates one
- * output entry per distinct description and records, in
- * {@link #originalTermStrings}, which term strings of which input result set
- * were folded into that entry.
- *
- * Details of the output:
- * <ul>
- * <li>Descriptions are only used if every input has them; otherwise entries
- * are grouped by term string, the output has no descriptions and
- * {@link #originalTermStrings} is <code>null</code>.</li>
- * <li>When descriptions are used, the term string of an entry is made of its
- * distinct original term strings, sorted and joined with " | ", or the
- * position of the entry if there are none.</li>
- * <li>Counts are summed, and are only produced if every input has them.</li>
- * <li>Lengths are produced if at least one input has them; an entry gets the
- * first length encountered for it, or -1 if none was.</li>
- * <li>The entries are output in the iteration order of the hash map used for
- * grouping.</li>
- * </ul>
- *
- * @param resSets the result sets to combine, e.g. those produced by the
- * sub-indexes of a federated index.
- * @return the combined result set.
- */
- public static TermsResultSet groupByDescription(TermsResultSet... resSets) {
-   // find out which of the optional arrays can be produced
-   boolean haveDescriptions = true;
-   boolean haveCounts = true;
-   boolean haveLengths = false;
-   for(TermsResultSet input : resSets) {
-     haveDescriptions &= (input.termDescriptions != null);
-     haveCounts &= (input.termCounts != null);
-     haveLengths |= (input.termLengths != null);
-   }
-
-   // phase 1: collect the data of all inputs, keyed by description (or by
-   // term string, if descriptions cannot be used)
-   Object2ObjectOpenHashMap<String, TermData> groups =
-       new Object2ObjectOpenHashMap<String, TermData>();
-   for(int setIdx = 0; setIdx < resSets.length; setIdx++) {
-     TermsResultSet input = resSets[setIdx];
-     for(int termIdx = 0; termIdx < input.termStrings.length; termIdx++) {
-       String groupKey;
-       if(haveDescriptions) {
-         groupKey = input.termDescriptions[termIdx];
-       } else {
-         groupKey = input.termStrings[termIdx];
-       }
-
-       // the term strings this input entry contributes to its group
-       String[] contributed;
-       String[][][] inherited = input.originalTermStrings;
-       if(inherited == null) {
-         // no old original term strings: use the actual term string
-         contributed = haveDescriptions ?
-             new String[]{input.termStrings[termIdx]} : null;
-       } else if(inherited[termIdx].length == 1) {
-         // the input was itself grouped, but from a single result set
-         contributed = inherited[termIdx][0];
-       } else {
-         // the input was grouped from several result sets: take the strings
-         // stored at the slot matching the position of this input
-         contributed = inherited[termIdx][setIdx];
-       }
-
-       TermData group = groups.get(groupKey);
-       if(group == null) {
-         group = new TermData(groupKey, resSets.length);
-         groups.put(groupKey, group);
-       }
-       if(haveDescriptions && contributed != null) {
-         for(String original : contributed) {
-           group.addString(setIdx, original);
-         }
-       }
-       if(haveCounts) {
-         group.count += input.termCounts[termIdx];
-       }
-       // a negative length marks a group that has no length yet
-       if(haveLengths && input.termLengths != null && group.length < 0) {
-         group.length = input.termLengths[termIdx];
-       }
-     }
-   }
-
-   // phase 2: lay the groups out into the arrays of the compound result set
-   final int groupCount = groups.size();
-   String[] outStrings = new String[groupCount];
-   String[] outDescriptions = haveDescriptions ? new String[groupCount] : null;
-   int[] outCounts = haveCounts ? new int[groupCount] : null;
-   int[] outLengths = haveLengths ? new int[groupCount] : null;
-   String[][][] outOriginals = haveDescriptions ?
-       new String[groupCount][][] : null;
-
-   ObjectIterator<Object2ObjectMap.Entry<String, TermData>> entries =
-       groups.object2ObjectEntrySet().fastIterator();
-   for(int slot = 0; entries.hasNext(); slot++) {
-     TermData group = entries.next().getValue();
-     if(!haveDescriptions) {
-       // grouping was done by term string, which is kept in the description
-       outStrings[slot] = group.description;
-     } else {
-       outDescriptions[slot] = group.description;
-       String[][] perSet = group.getStrings();
-       outOriginals[slot] = perSet;
-       // the term string of a group is built from its distinct original
-       // term strings
-       Set<String> distinct = new HashSet<String>();
-       for(String[] fromOneSet : perSet) {
-         Collections.addAll(distinct, fromOneSet);
-       }
-       if(distinct.isEmpty()) {
-         // nothing to build a name from: fall back to the position
-         outStrings[slot] = Integer.toString(slot);
-       } else {
-         List<String> ordered = new ArrayList<String>(distinct);
-         Collections.sort(ordered);
-         Iterator<String> termIter = ordered.iterator();
-         StringBuilder label = new StringBuilder(termIter.next());
-         while(termIter.hasNext()) {
-           label.append(" | ").append(termIter.next());
-         }
-         outStrings[slot] = label.toString();
-       }
-     }
-     if(haveCounts) {
-       outCounts[slot] = group.count;
-     }
-     if(haveLengths) {
-       outLengths[slot] = group.length;
-     }
-   }
-
-   TermsResultSet combined = new TermsResultSet(outStrings, outLengths,
-       outCounts, outDescriptions);
-   combined.originalTermStrings = outOriginals;
-   return combined;
- }
-
- /**
- * Accumulator used internally while grouping terms result sets: it gathers
- * the description, summed count, length and per-result-set term strings of
- * one group.
- * See {@link TermsResultSet#groupByDescription(TermsResultSet...)}.
- */
- private static class TermData {
-   private String description;
-
-   /**
-   * Running total of the counts; starts at zero.
-   */
-   private int count = 0;
-
-   /**
-   * The length of the term; a negative value means "not set yet".
-   */
-   private int length = -1;
-
-   /**
-   * The number of result sets being combined
-   */
-   private int arity;
-
-   /**
-   * An array of size {@link #arity}, element at position i containing the
-   * term strings in the result set at position i, for this term description.
-   * Elements stay <code>null</code> until the first string is added for them.
-   */
-   private ObjectArrayList<String>[] strings;
-
-   public TermData(String description, int arity) {
-     this.arity = arity;
-     this.description = description;
-     this.strings = new ObjectArrayList[arity];
-   }
-
-   /**
-   * Adds a new term string for the sub-index at a given position.
-   * @param position the position of the result set the string comes from.
-   * @param string the term string to record.
-   */
-   public void addString(int position, String string) {
-     ObjectArrayList<String> bucket = strings[position];
-     if(bucket == null) {
-       // first string for this position: create its list on demand
-       bucket = new ObjectArrayList<String>();
-       strings[position] = bucket;
-     }
-     bucket.add(string);
-   }
-
-   /**
-   * Gets the collected term strings as plain arrays.
-   * @return an array with one element per result set; positions for which
-   * no string was added hold an empty array.
-   */
-   public String[][] getStrings() {
-     final int slots = strings.length;
-     String[][] perSet = new String[slots][];
-     for(int slot = 0; slot < slots; slot++) {
-       ObjectArrayList<String> bucket = strings[slot];
-       perSet[slot] = (bucket == null) ?
-           new String[0] : bucket.toArray(new String[bucket.size()]);
-     }
-     return perSet;
-   }
- }
-
- /**
- * A {@link it.unimi.dsi.fastutil.Swapper} implementation for
- * {@link TermsResultSet}s: exchanging two positions exchanges the
- * corresponding elements of the term strings array and of each of the other
- * parallel arrays that is present, so all the arrays stay aligned.
- */
- public static class Swapper implements it.unimi.dsi.fastutil.Swapper {
-   private TermsResultSet target;
-
-   public Swapper(TermsResultSet trs) {
-     this.target = trs;
-   }
-
-   @Override
-   public void swap(int a, int b) {
-     // the term strings are always expected to be there; the other arrays
-     // are optional
-     exchange(target.termStrings, a, b);
-     if(target.termCounts != null) {
-       exchange(target.termCounts, a, b);
-     }
-     if(target.termLengths != null) {
-       exchange(target.termLengths, a, b);
-     }
-     if(target.termDescriptions != null) {
-       exchange(target.termDescriptions, a, b);
-     }
-     if(target.originalTermStrings != null) {
-       exchange(target.originalTermStrings, a, b);
-     }
-   }
-
-   /**
-   * Exchanges two elements of an int array.
-   */
-   private static void exchange(int[] values, int first, int second) {
-     int held = values[first];
-     values[first] = values[second];
-     values[second] = held;
-   }
-
-   /**
-   * Exchanges two elements of an object array.
-   */
-   private static <T> void exchange(T[] values, int first, int second) {
-     T held = values[first];
-     values[first] = values[second];
-     values[second] = held;
-   }
- }
- }