RepeatsQuery.java

  1. /*
  2.  *  RepeatsQuery.java
  3.  *
  4.  *  Copyright (c) 2007-2011, The University of Sheffield.
  5.  *
  6.  *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  *  and is free software, licenced under the GNU Lesser General Public License,
  8.  *  Version 3, June 2007 (also included with this distribution as file
  9.  *  LICENCE-LGPL3.html).
  10.  *
  11.  *  Valentin Tablan, 17 Mar 2009
  12.  *    
  13.  *  $Id: RepeatsQuery.java 20208 2017-04-19 08:35:28Z domrout $
  14.  */
  15. package gate.mimir.search.query;


  16. import gate.mimir.search.QueryEngine;

  17. import it.unimi.dsi.fastutil.objects.ReferenceSet;
  18. import it.unimi.di.big.mg4j.index.Index;
  19. import it.unimi.di.big.mg4j.search.visitor.DocumentIteratorVisitor;

  20. import java.io.IOException;
  21. import java.util.*;


  22. /**
  23.  * A query node that wraps another query node. Results for this query are
  24.  * sequences of consecutive results from the wrapped query,  with lengths
  25.  * between specified minimum and maximum values.
  26.  * This is similar to a bounded Kleene operator, e.g. {Token}[2,3] in JAPE
  27.  * notation.
  28.  */
  29. public class RepeatsQuery implements QueryNode {

  30.   private static final long serialVersionUID = 8441324338156901268L;

  31.   /**
  32.    * A {@link QueryExecutor} for repeats queries.
  33.    */
  34.   public static class RepeatsQueryExecutor extends AbstractQueryExecutor{
  35.    
  36.     /**
  37.      * @param engine
  38.      * @param query
  39.      * @throws IOException
  40.      */
  41.     public RepeatsQueryExecutor(RepeatsQuery query, QueryEngine engine) throws IOException {
  42.       super(engine, query);
  43.       this.query = query;
  44.       this.wrappedExecutor = query.wrappedQuery.getQueryExecutor(engine);
  45.       hitsOnCurrentDocument = new LinkedList<Binding[]>();
  46.     }

  47.     /**
  48.      * Holds the hits on the current document. This data structure is populated
  49.      * by the {@link #nextDocument(int)} method, and is consumed by the
  50.      * {@link #nextHit()} method.
  51.      * Each entry is a hit (represented as a sequence of consecutive hits from
  52.      * the wrapped query).
  53.      */
  54.     protected List<Binding[]> hitsOnCurrentDocument;
  55.    
  56.     /**
  57.      * The query being executed.
  58.      */
  59.     protected RepeatsQuery query;
  60.    
  61.     /**
  62.      * The executor for the query wrapped by this RepeastQuery.
  63.      */
  64.     protected QueryExecutor wrappedExecutor;
  65.    
  66.    
  67.     /* (non-Javadoc)
  68.      * @see gate.mimir.search.query.QueryExecutor#close()
  69.      */
  70.     public void close() throws IOException {
  71.       if(closed) return;
  72.       super.close();
  73.       wrappedExecutor.close();
  74.       hitsOnCurrentDocument.clear();
  75.     }

  76.     /* (non-Javadoc)
  77.      * @see gate.mimir.search.query.QueryExecutor#nextDocument(int)
  78.      */
  79.     public long nextDocument(long greaterThan) throws IOException {
  80.       if(closed) return latestDocument = -1;
  81.       if(latestDocument == -1) return latestDocument;
  82.       long nextDoc = -1;
  83.       while(nextDoc < 0){
  84.         nextDoc = wrappedExecutor.nextDocument(greaterThan);
  85.         if(nextDoc < 0){
  86.           //no more docs
  87.           return latestDocument = nextDoc;
  88.         }
  89.         getHitsOnCurrentDocument();
  90.         if(hitsOnCurrentDocument.isEmpty()) nextDoc = -1;
  91.       }
  92.       return latestDocument = nextDoc;
  93.     }

  94.     protected void getHitsOnCurrentDocument() throws IOException{
  95.       //extract all hits on the current document
  96.       List<Binding> hits = new ArrayList<Binding>();
  97.       Binding aHit = wrappedExecutor.nextHit();
  98.       while(aHit != null){
  99.         hits.add(aHit);
  100.         aHit = wrappedExecutor.nextHit();
  101.       }
  102.       //calculate the marks for each hit, at each position in the sequence
  103.       int[][] marks = new int[query.max][];
  104.       for(int i = 0; i < marks.length; i++){
  105.         marks[i] = new int[hits.size()];
  106.       }
  107.       //we mark from right to left, starting from the penultimate list
  108.       //for the last list, all elements are marked as valid
  109.       //VT: next lines removed, as Java initialises arrays with 0
  110. //      for(int i = 0; i < marks[marks.length -1].length; i++){
  111. //        marks[marks.length -1][i] = 0;
  112. //      }
  113.       int currentSlot = marks.length -2;
  114.       while(currentSlot >= 0){
  115.         //Should we only mark an entry if its next slot has a suffix itself?
  116.         boolean suffixCompulsory = currentSlot < query.min -2;
  117.         for(int i = 0 ; i < hits.size(); i++ ){
  118.           //for each candidate hit
  119.           int nextStart = hits.get(i).getTermPosition() +
  120.               hits.get(i).getLength();
  121.           //find the first hit in the next list and store its pointer
  122.           //search for the first valid next hit
  123.           //we start from the current hit position +1 (hits are ordered, and a
  124.           //hit cannot follow itself)
  125.           int nextHitIdx = i + 1;
  126.           //first skip the hits too small
  127.           while(nextHitIdx < marks[currentSlot + 1].length &&
  128.                 hits.get(nextHitIdx).getTermPosition() < nextStart){
  129.             nextHitIdx++;
  130.           }
  131.           //next process all candidates, until we find a valid one, or we run out
  132.           marks[currentSlot][i] = -1;
  133.           while(nextHitIdx < marks[currentSlot + 1].length &&
  134.                 hits.get(nextHitIdx).getTermPosition() == nextStart){
  135.             //A good hit is one that:
  136.             // - starts at nextStart, and
  137.             // - if suffixCompulsory, has a non-negative mark itself
  138.             if(suffixCompulsory){
  139.               if(marks[currentSlot +1][nextHitIdx] >= 0){
  140.                 //we found the next hit
  141.                 marks[currentSlot][i] = nextHitIdx;
  142.                 break;
  143.               }else{
  144.                 nextHitIdx++;
  145.               }
  146.             }else{
  147.               //no other conditions required
  148.               marks[currentSlot][i] = nextHitIdx;
  149.               break;
  150.             }
  151.           }
  152.         }//for(int i = 0 ; i < hits.size(); i++ )
  153.         currentSlot--;
  154.       }//while(currentSlot >= 0)
  155.      
  156.       //we finished marking, now we collect results from left to right
  157.       hitsOnCurrentDocument.clear();
  158.       //for each marked starting point on the first slot, start enumerating the
  159.       //hits
  160.       for(int i = 0; i < marks[0].length; i++){
  161.         if(marks[0][i] >= 0 || query.min == 1){
  162.           Binding[] slots = new Binding[query.max];
  163.           extractHitsRec(query, hitsOnCurrentDocument, hits, marks, 0, i, slots);
  164.         }
  165.       }
  166.      
  167.     }
  168.    
  169.     /**
  170.      * Recursively extracts the hits
  171.      * @param hits the arrays of hits, one row for each of the {@link #executorsCache}.
  172.      * @param marks the marks for the hits
  173.      * @param currentSlot which slot to fill at this stage (used for recursion).
  174.      * @param currentHit for the current slot, which candidate hit to start from.
  175.      * @param slots the array of slots that needs filling.
  176.      * @see #computeMark(int, int, Binding[], int[])
  177.      */
  178.     protected static void extractHitsRec(RepeatsQuery query,
  179.             List<Binding[]> results, List<Binding> hits, int[][] marks,
  180.             int currentSlot, int currentHit, Binding[] slots){
  181.       slots[currentSlot] = hits.get(currentHit);
  182.       if(currentSlot >= query.min -1){
  183.         //we have a full result -> make a copy of the right length and add it
  184.         Binding[] aResult = new Binding[currentSlot +1];
  185.         for(int i = 0; i<= currentSlot; i++) aResult[i] = slots[i];
  186.         results.add(aResult);
  187.       }
  188.       //if we're not at the end yet, keep recursing
  189.       if(currentSlot < query.max -1){
  190.         if(marks[currentSlot][currentHit] >=0){
  191.           //recursive call for the first next candidate
  192.           extractHitsRec(query, results, hits, marks, currentSlot + 1,
  193.                   marks[currentSlot][currentHit], slots);
  194.           //now find all other candidates for next slot
  195.           for(int i = marks[currentSlot][currentHit] +1;
  196.               i < marks[currentSlot + 1].length &&
  197.               slots[currentSlot].getTermPosition() +
  198.               slots[currentSlot].getLength() == hits.get(i).getTermPosition();
  199.               i++){
  200.             //The next candidate starts at the right position.
  201.             //If it is valid, add it to solution and do recursive call.
  202.             //do we need a suffix?
  203.             if(currentSlot < query.min -2){
  204.               //the next hit must have a valid mark itself
  205.               if(marks[currentSlot + 1][i] >= 0){
  206.                 //copy the slots array
  207.                 Binding[] newSlots = new Binding[slots.length];
  208.                 for(int j = 0; j <= currentSlot; j++){
  209.                   newSlots[j] = slots[j];
  210.                 }
  211.                 extractHitsRec(query, results, hits, marks, currentSlot + 1, i, newSlots);
  212.               }
  213.             }else{
  214.               //we don't need to enforce a suffix -> unconditional recursive call
  215.               Binding[] newSlots = new Binding[slots.length];
  216.               for(int j = 0; j <= currentSlot; j++){
  217.                 newSlots[j] = slots[j];
  218.               }
  219.               extractHitsRec(query, results, hits, marks, currentSlot + 1, i, newSlots);
  220.             }
  221.           }
  222.         }else{
  223.           //no furhter advance possible
  224.         }
  225.       }
  226.     }
  227.    
  228.    
  229.    
  230.     /* (non-Javadoc)
  231.      * @see gate.mimir.search.query.QueryExecutor#nextHit()
  232.      */
  233.     public Binding nextHit() throws IOException {
  234.       if(closed) return null;
  235.       if(hitsOnCurrentDocument.isEmpty()){
  236.         return null;
  237.       }else{
  238.         Binding[] hitSlots = hitsOnCurrentDocument.remove(0);
  239.         int length = hitSlots[hitSlots.length - 1].getTermPosition() +
  240.             hitSlots[hitSlots.length - 1].getLength() -
  241.             hitSlots[0].getTermPosition();
  242.         return new Binding(query, hitSlots[0].getDocumentId(),
  243.                 hitSlots[0].getTermPosition(),length ,
  244.                 (engine.isSubBindingsEnabled() ? hitSlots : null));
  245.       }
  246.     }

  247.     @Override
  248.     public ReferenceSet<Index> indices() {
  249.       return wrappedExecutor.indices();
  250.     }
  251.    
  252.     public <T> T accept( final DocumentIteratorVisitor<T> visitor ) throws IOException {
  253.       if ( ! visitor.visitPre( this ) ) return null;
  254.       final T[] a = visitor.newArray( 1 );
  255.       if ( a == null ) {
  256.         if ( wrappedExecutor.accept( visitor ) == null ) return null;
  257.       }
  258.       else {
  259.         if ( ( a[ 0 ] = wrappedExecutor.accept( visitor ) ) == null ) return null;
  260.       }
  261.       return visitor.visitPost( this, a );
  262.     }

  263.     public <T> T acceptOnTruePaths( final DocumentIteratorVisitor<T> visitor ) throws IOException {
  264.       if ( ! visitor.visitPre( this ) ) return null;
  265.       final T[] a = visitor.newArray( 1 );
  266.       if ( a == null ) {
  267.         if ( wrappedExecutor.acceptOnTruePaths( visitor ) == null ) return null;    
  268.       }
  269.       else {
  270.         if ( ( a[ 0 ] = wrappedExecutor.acceptOnTruePaths( visitor ) ) == null ) return null;
  271.       }
  272.       return visitor.visitPost( this, a );
  273.     }
  274.   }
  275.  
  276.  
  277.   /**
  278.    * The minimum number of repeats required.
  279.    */
  280.   protected int min;
  281.  
  282.   /**
  283.    * The maximum number of repeats permitted.
  284.    */
  285.   protected int max;
  286.  
  287.   /**
  288.    * The wrapped query.
  289.    */
  290.   protected QueryNode wrappedQuery;
  291.  
  292.   /**
  293.    * Creates a new repeats query.
  294.    * @param query the query to be wrapped.
  295.    * @param min the minimum number of repeats required. This value needs to be
  296.    * greater than 0.
  297.    * @param max the maximum number of repeats permitted. This value needs to be
  298.    * greater or equal to the value given for <code>min</code>.
  299.    */
  300.   public RepeatsQuery(QueryNode query, int min, int max){
  301.     this.wrappedQuery = query;
  302.     if(min <= 0) throw new IllegalArgumentException(
  303.             "The value provided for minimum (" + min +
  304.             ") is not strictly positive!");
  305.     if(max < min) throw new IllegalArgumentException(
  306.             "The value provided for maximum repeats (" + max +
  307.             ") is smaller than thee value for minimum repeats (" + min + ").");
  308.     this.min = min;
  309.     this.max = max;
  310.   }
  311.  
  312.   /* (non-Javadoc)
  313.    * @see gate.mimir.search.query.QueryNode#getQueryExecutor(gate.mimir.search.QueryEngine)
  314.    */
  315.   public QueryExecutor getQueryExecutor(QueryEngine engine) throws IOException {
  316.     return new RepeatsQueryExecutor(this, engine);
  317.   }
  318.  
  319.   public String toString() {
  320.     return "REPEATS (" + wrappedQuery.toString() + " [" + min + ".." +
  321.         max + "])";
  322.   }

  323.   public int getMin() {
  324.     return min;
  325.   }

  326.   public int getMax() {
  327.     return max;
  328.   }

  329.   public QueryNode getWrappedQuery() {
  330.     return wrappedQuery;
  331.   }
  332. }