TermsResultSet.java

  1. /*
  2.  *  TermsResultSet.java
  3.  *
  4.  *  Copyright (c) 2007-2011, The University of Sheffield.
  5.  *
  6.  *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  *  and is free software, licenced under the GNU Lesser General Public License,
  8.  *  Version 3, June 2007 (also included with this distribution as file
  9.  *  LICENCE-LGPL3.html).
  10.  *  
  11.  *  Valentin Tablan, 13 Jul 2012
  12.  *
  13.  *  $Id: TermsResultSet.java 19444 2016-06-28 16:38:18Z ian_roberts $
  14.  */
  15. package gate.mimir.search.terms;

  16. import gate.mimir.SemanticAnnotationHelper;
  17. import it.unimi.dsi.fastutil.Arrays;
  18. import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
  19. import it.unimi.dsi.fastutil.ints.IntArrayList;
  20. import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
  21. import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
  22. import it.unimi.dsi.fastutil.objects.ObjectArrayList;
  23. import it.unimi.dsi.fastutil.objects.ObjectIterator;
  24. import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;

  25. import java.io.Serializable;
  26. import java.util.ArrayList;
  27. import java.util.Collections;
  28. import java.util.HashSet;
  29. import java.util.Iterator;
  30. import java.util.List;
  31. import java.util.Set;

  32. /**
  33.  * Class representing the results of a {@link TermsQuery}.
  34.  * A terms result set is a set of terms, represented by their
  35.  * {@link #termStrings}. Optionally {@link #termCounts},
  36.  * {@link #termDescriptions}, and {@link #termLengths} may also be available.
  37.  */
  38. public class TermsResultSet implements Serializable {
  39.  
  40.   /**
  41.    * Serialization ID.
  42.    */
  43.   private static final long serialVersionUID = -7722325563637139625L;

  44.  
  45.   /**
  46.    * The lengths (number of tokens) for the terms. Array parallel with
  47.    * {@link #termStrings}, and {@link #termDescriptions}.
  48.    */
  49.   public final int[] termLengths;
  50.  
  51.   /**
  52.    * The strings for the terms. Array parallel with
  53.    * {@link #termCounts} and {@link #termDescriptions}.
  54.    */
  55.   public final String[] termStrings;

  56.   /**
  57.    * This field is populated by the
  58.    * {@link #groupByDescription(TermsResultSet...)} method. It contains term
  59.    * strings from the original result sets indexed by position in this result
  60.    * set, and the index of the results set. For example
  61.    * originalTermStrings[i][j] is a String[], containing all the term strings
  62.    * associated with termDescriptions[i] in the j<sup>th</sup> result set.  
  63.    */
  64.   public String[][][] originalTermStrings;
  65.  
  66.   /**
  67.    * For annotation indexes, the term string is simply a URI in whatever format
  68.    * is used by the {@link SemanticAnnotationHelper} that was used to index the
  69.    * annotations. These URIs are not useful outside of the annotation helper
  70.    * and index, so term descriptions can be requested. If term descriptions were
  71.    * produced during the search, they are stored in this array (which is aligned
  72.    *  with {@link #termIds} and {@link #termCounts}).
  73.    */
  74.   public final String[] termDescriptions;
  75.  
  76.   /**
  77.    * The counts (numbers of occurrences) for the terms. Array parallel with
  78.    * {@link #termStrings} and {@link #termIds}.
  79.    */
  80.   public final int[] termCounts;
  81.  
  82.   public TermsResultSet(String[] termStrings, int[] termLengths,
  83.                         int[] termCounts, String[] termDescriptions) {
  84.     super();
  85.     this.termStrings = termStrings;
  86.     this.termLengths = termLengths;
  87.     this.termCounts = termCounts;
  88.     this.termDescriptions = termDescriptions;
  89.   }
  90.  
  91.   /**
  92.    * Constant representing the empty result set.
  93.    */
  94.   public static final TermsResultSet EMPTY = new TermsResultSet(
  95.       new String[]{}, new int[] {}, new int[]{}, new String[]{});
  96.  
  97.  
  98.   /**
  99.    * Given a position in {@link #termDescriptions}, this method computes all
  100.    * term strings that had that description in each of the sub-indexes of the
  101.    * federated index that produced this result set.
  102.    * @param termPosition the term for which the original term strings are being
  103.    * requested.
  104.    * @return An array where element at position i is an array containing all the
  105.    * term strings (in the dictionary of sub-index i) that had the given term
  106.    * description when the original query was answered by sub-index i, or null
  107.    * if original terms strings are not available.
  108.    */
  109.   public String[][] getSubIndexTerms(int termPosition) {
  110.     return (originalTermStrings != null) ?
  111.         originalTermStrings[termPosition] : null;
  112.   }
  113.  
  114.   /**
  115.    * Tries to locate the correct term position and calls
  116.    * {@link #getSubIndexTerms(int)}.
  117.    * @param termString
  118.    * @return
  119.    */
  120.   public String[][] getSubIndexTerms(String termString) {
  121.     int termPos = -1;
  122.     try {
  123.       termPos = Integer.parseInt(termString);
  124.     } catch (Exception e) {}
  125.     if(termStrings[termPos].equals(termString)) {
  126.       return getSubIndexTerms(termPos);
  127.     } else{
  128.       // could not convert it: leave it unchanged
  129.       return new String[][]{{termString}};
  130.     }
  131.   }
  132.  
  133.   /**
  134.    * Sorts the arrays inside a {@link TermsResultSet} using the termString for
  135.    * comparison.
  136.    * @param trs
  137.    */
  138.   public static void sortTermsResultSetByTermString(final TermsResultSet trs) {
  139.     Arrays.quickSort(0, trs.termStrings.length, new AbstractIntComparator() {
  140.       @Override
  141.       public int compare(int k1, int k2) {
  142.         return trs.termStrings[k1].compareTo(trs.termStrings[k2]);
  143.       }
  144.     }, new Swapper(trs));
  145.   }

  146.   /**
  147.    * Enumerates a result set and produces a new one after removing all the terms
  148.    * with descriptions in the banned list.
  149.    * @param bannedDescriptions A String array containing all the banned term
  150.    * descriptions.
  151.    * @param setToFilter the terms result set to filter
  152.    * @return the filtered result set.
  153.    */
  154.   public static TermsResultSet filterByDescriptionNot(TermsResultSet setToFilter, String... bannedDescriptions) {
  155.     final boolean descriptionsAvailable = setToFilter.termDescriptions != null;
  156.     if(!descriptionsAvailable) return setToFilter;
  157.    
  158.     final boolean countsAvailable = setToFilter.termCounts != null;
  159.     final boolean lengthsAvailable = setToFilter.termLengths != null;
  160.     final boolean origTermsAvailable = setToFilter.originalTermStrings != null;
  161.    
  162.     IntArrayList counts = countsAvailable ? new IntArrayList() : null;
  163.     IntArrayList lengths = lengthsAvailable ? new IntArrayList() : null;
  164.     ObjectArrayList<String> strings = new ObjectArrayList<String>();
  165.     ObjectArrayList<String> descriptions = new ObjectArrayList<String>();
  166.     ObjectArrayList<String[][]> origTerms = new ObjectArrayList<String[][]>();
  167.     ObjectOpenHashSet<String> bannedSet = new ObjectOpenHashSet<String>(bannedDescriptions);
  168.    
  169.     for(int i = 0; i < setToFilter.termDescriptions.length; i++) {
  170.       if(!bannedSet.contains(setToFilter.termDescriptions[i])) {
  171.         descriptions.add(setToFilter.termDescriptions[i]);
  172.         strings.add(setToFilter.termStrings[i]);
  173.         if(countsAvailable) counts.add(setToFilter.termCounts[i]);
  174.         if(lengthsAvailable) lengths.add(setToFilter.termLengths[i]);
  175.         if(origTermsAvailable)origTerms.add(setToFilter.originalTermStrings[i]);
  176.       }
  177.     }
  178.     int size = descriptions.size();
  179.     TermsResultSet res = new TermsResultSet(
  180.       strings.toArray(new String[size]),
  181.       lengthsAvailable ? lengths.toArray(new int[size]) : null,
  182.       countsAvailable ? counts.toArray(new int[size]) : null,
  183.       descriptions.toArray(new String[size]));
  184.     if(origTermsAvailable) res.originalTermStrings =
  185.         origTerms.toArray(new String[size][][]);
  186.     return res;
  187.   }
  188.  
  189.   /**
  190.    * This method re-arranges the data included in one or more
  191.    * {@link TermsResultSet} values so that each term description occurs only
  192.    * once in the {@link #termDescriptions} array.
  193.    *
  194.    * A {@link TermsResultSet} obtained when calling
  195.    * {@link TermsQuery#execute(gate.mimir.search.QueryEngine)} may include the
  196.    * same description for multiple term strings: depending on the implementation
  197.    * used to describe terms, distinct terms may end up with the same
  198.    * description. This could cause confusion when the output is presented to
  199.    * the user, as they would have no way to distinguish between the different
  200.    * terms.
  201.    *
  202.    * When executing a terms query against a federated index, each sub-index
  203.    * returns its own result set. Terms originating in different sub-indexes can
  204.    * have the same description.
  205.    *
  206.    * This method combines these into a unified result set that preserves the
  207.    * right term ID to term description mappings by populating the
  208.    * {@link #originalTermStrings} array.
  209.    *
  210.    * @param resSets the result sets produced by the sub-indexes of a federated
  211.    * index.
  212.    * @return the combined result set.
  213.    */
  214.   public static TermsResultSet groupByDescription(TermsResultSet... resSets) {
  215.     boolean descriptionsAvaialble = true;
  216.     boolean countsAvaialble = true;
  217.     boolean lengthsAvaialble = false;
  218.     for(TermsResultSet trs : resSets) {
  219.       if(trs.termDescriptions == null) {
  220.         descriptionsAvaialble = false;
  221.       }
  222.       if(trs.termCounts == null) {
  223.         countsAvaialble = false;
  224.       }
  225.       if(trs.termLengths != null) {
  226.         lengthsAvaialble = true;
  227.       }
  228.     }

  229.     Object2ObjectOpenHashMap<String, TermData> desc2TermData =
  230.         new Object2ObjectOpenHashMap<String, TermData>();
  231.    
  232.     for(int subIndexPos = 0; subIndexPos < resSets.length; subIndexPos++) {
  233.       TermsResultSet trs = resSets[subIndexPos];
  234.       for(int i = 0; i < trs.termStrings.length; i++) {
  235.         String description = descriptionsAvaialble ?
  236.             trs.termDescriptions[i] : trs.termStrings[i];
  237. //          String string = descriptionsAvaialble ? trs.termStrings[i] : null;
  238.         // get all the strings describing the current term
  239.         String[] strings = null;
  240.         if(trs.originalTermStrings != null) {
  241.           // old TRS already has original term strings
  242.           if(trs.originalTermStrings[i].length == 1) {
  243.             // old TRS was not federated
  244.             strings = trs.originalTermStrings[i][0];
  245.           } else {
  246.             // old TRS was federated: get the term strings from the correct sub-index
  247.             strings = trs.originalTermStrings[i][subIndexPos];
  248.           }
  249.         } else {
  250.           // no old original term strings: use the actual term string
  251.           strings = descriptionsAvaialble ?
  252.               new String[]{trs.termStrings[i]} : null;
  253.         }
  254.        
  255.         TermData tData = desc2TermData.get(description);
  256.         if(tData == null) {
  257.           tData = new TermData(description, resSets.length);
  258.           desc2TermData.put(description, tData);
  259.         }
  260.         if(descriptionsAvaialble && strings != null){
  261.           for(String s : strings) tData.addString(subIndexPos, s);
  262. //          tData.addString(subIndexPos, string);
  263.         }
  264.         if(countsAvaialble) {
  265.           tData.count += trs.termCounts[i];
  266.         }
  267.         if(lengthsAvaialble && trs.termLengths != null && tData.length < 0) {
  268.           tData.length = trs.termLengths[i];
  269.         }
  270.       }
  271.     }
  272.     // produce the compound result set
  273.     String[] newStrings = new String[desc2TermData.size()];
  274.     String[] newDescriptions = descriptionsAvaialble ?
  275.         new String[desc2TermData.size()] : null;
  276.     int[] newCounts = countsAvaialble ? new int[desc2TermData.size()] : null;
  277.     int[] newLenghts = lengthsAvaialble ? new int[desc2TermData.size()] : null;
  278.     String[][][] originalTermStrings = descriptionsAvaialble ?
  279.       new String[desc2TermData.size()][][] : null;
  280.     ObjectIterator<Object2ObjectMap.Entry<String, TermData>> iter =
  281.         desc2TermData.object2ObjectEntrySet().fastIterator();    
  282.     int pos = 0;
  283.     while(iter.hasNext()) {
  284.       TermData tData = iter.next().getValue();
  285.       if(descriptionsAvaialble) {
  286.         newDescriptions[pos] = tData.description;
  287.         originalTermStrings[pos] = tData.getStrings();
  288.         // term string does not actually mean anything;
  289.         // we use the term position instead
  290.         // newStrings[pos] = Integer.toString(pos);
  291.         Set<String> uniq = new HashSet<String>();
  292.         for(String[] terms : originalTermStrings[pos]) {
  293.           for(String term : terms) {
  294.             uniq.add(term);
  295.           }
  296.         }
  297.         if(uniq.isEmpty()) {
  298.           newStrings[pos] = Integer.toString(pos);
  299.         } else {
  300.           List<String> termList= new ArrayList<String>(uniq);
  301.           Collections.sort(termList);
  302.           StringBuilder strb = new StringBuilder(termList.get(0));
  303.           for(int i = 1; i < termList.size(); i++) {
  304.             strb.append(" | ").append(termList.get(i));
  305.           }
  306.           newStrings[pos] = strb.toString();          
  307.         }
  308.       } else {
  309.         newStrings[pos] = tData.description;
  310.       }
  311.       if(countsAvaialble) newCounts[pos] = tData.count;
  312.       if(lengthsAvaialble) newLenghts[pos] = tData.length;
  313.       pos++;
  314.     }
  315.    
  316.     TermsResultSet res = new TermsResultSet(newStrings, newLenghts, newCounts,
  317.       newDescriptions);
  318.     res.originalTermStrings = originalTermStrings;
  319.     return res;
  320.   }
  321.  
  322.   /**
  323.    * Class used internally to store the term data when grouping terms results sets.
  324.    * See {@link TermsResultSet#groupByDescription(TermsResultSet...)}.
  325.    */
  326.   private static class TermData {
  327.     private String description;
  328.     private int count;
  329.     private int length;
  330.    
  331.     /**
  332.      * The number of result sets being combined
  333.      */
  334.     private int arity;
  335.    
  336.     /**
  337.      * An array of size {@link #arity}, element at position i containing the
  338.      * term strings in the result set at position i, for this term description.
  339.      */
  340.     private ObjectArrayList<String>[] strings;

  341.     public TermData(String description, int arity) {
  342.       super();
  343.       this.description = description;
  344.       this.arity = arity;
  345.       strings = new ObjectArrayList[arity];
  346.       this.count = 0;
  347.       this.length = -1;
  348.     }
  349.    
  350.     /**
  351.      * Adds a new term string for the sub-index at a given position.
  352.      * @param position
  353.      * @param string
  354.      */
  355.     public void addString(int position, String string) {
  356.       if(strings[position] == null) {
  357.         strings[position] = new ObjectArrayList<String>();
  358.       }
  359.       strings[position].add(string);
  360.     }
  361.    
  362.     public String[][] getStrings() {
  363.       String[][] res = new String[strings.length][];
  364.       for(int i = 0; i < strings.length; i++) {
  365.         if(strings[i] == null) {
  366.           res[i] = new String[0];
  367.         } else {
  368.           res[i] = strings[i].toArray(new String[strings[i].size()]);
  369.         }
  370.       }
  371.       return res;
  372.     }
  373.   }
  374.  
  375.   /**
  376.    * A {@link it.unimi.dsi.fastutil.Swapper} implementation for
  377.    * {@link TermsResultSet}s.
  378.    */
  379.   public static class Swapper implements it.unimi.dsi.fastutil.Swapper {
  380.     private TermsResultSet trs;
  381.    
  382.     public Swapper(TermsResultSet trs) {
  383.       this.trs = trs;
  384.     }
  385.    
  386.     @Override
  387.     public void swap(int a, int b) {
  388.       String termString = trs.termStrings[a];
  389.       trs.termStrings[a] = trs.termStrings[b];
  390.       trs.termStrings[b] = termString;
  391.       if(trs.termCounts != null) {
  392.         int termCount = trs.termCounts[a];
  393.         trs.termCounts[a] = trs.termCounts[b];
  394.         trs.termCounts[b] = termCount;
  395.       }
  396.       if(trs.termLengths != null) {
  397.         int termLength = trs.termLengths[a];
  398.         trs.termLengths[a] = trs.termLengths[b];
  399.         trs.termLengths[b] = termLength;
  400.       }
  401.       if(trs.termDescriptions != null) {
  402.         String termDesc = trs.termDescriptions[a];
  403.         trs.termDescriptions[a] = trs.termDescriptions[b];
  404.         trs.termDescriptions[b] = termDesc;
  405.       }
  406.       if(trs.originalTermStrings != null) {
  407.         String[][] origTSs = trs.originalTermStrings[a];
  408.         trs.originalTermStrings[a] = trs.originalTermStrings[b];
  409.         trs.originalTermStrings[b] = origTSs;
  410.       }
  411.     }
  412.   }
  413. }