RepeatsQuery.java
/*
* RepeatsQuery.java
*
* Copyright (c) 2007-2011, The University of Sheffield.
*
* This file is part of GATE MÃmir (see http://gate.ac.uk/family/mimir.html),
* and is free software, licenced under the GNU Lesser General Public License,
* Version 3, June 2007 (also included with this distribution as file
* LICENCE-LGPL3.html).
*
* Valentin Tablan, 17 Mar 2009
*
* $Id: RepeatsQuery.java 20208 2017-04-19 08:35:28Z domrout $
*/
package gate.mimir.search.query;
import gate.mimir.search.QueryEngine;
import it.unimi.dsi.fastutil.objects.ReferenceSet;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.search.visitor.DocumentIteratorVisitor;
import java.io.IOException;
import java.util.*;
/**
* A query node that wraps another query node. Results for this query are
* sequences of consecutive results from the wrapped query, with lengths
* between specified minimum and maximum values.
* This is similar to a bounded Kleene operator, e.g. {Token}[2,3] in JAPE
* notation.
*/
public class RepeatsQuery implements QueryNode {
private static final long serialVersionUID = 8441324338156901268L;
/**
* A {@link QueryExecutor} for repeats queries.
*/
public static class RepeatsQueryExecutor extends AbstractQueryExecutor{
/**
* @param engine
* @param query
* @throws IOException
*/
public RepeatsQueryExecutor(RepeatsQuery query, QueryEngine engine) throws IOException {
super(engine, query);
this.query = query;
this.wrappedExecutor = query.wrappedQuery.getQueryExecutor(engine);
hitsOnCurrentDocument = new LinkedList<Binding[]>();
}
/**
* Holds the hits on the current document. This data structure is populated
* by the {@link #nextDocument(int)} method, and is consumed by the
* {@link #nextHit()} method.
* Each entry is a hit (represented as a sequence of consecutive hits from
* the wrapped query).
*/
protected List<Binding[]> hitsOnCurrentDocument;
/**
* The query being executed.
*/
protected RepeatsQuery query;
/**
* The executor for the query wrapped by this RepeastQuery.
*/
protected QueryExecutor wrappedExecutor;
/* (non-Javadoc)
* @see gate.mimir.search.query.QueryExecutor#close()
*/
public void close() throws IOException {
if(closed) return;
super.close();
wrappedExecutor.close();
hitsOnCurrentDocument.clear();
}
/* (non-Javadoc)
* @see gate.mimir.search.query.QueryExecutor#nextDocument(int)
*/
public long nextDocument(long greaterThan) throws IOException {
if(closed) return latestDocument = -1;
if(latestDocument == -1) return latestDocument;
long nextDoc = -1;
while(nextDoc < 0){
nextDoc = wrappedExecutor.nextDocument(greaterThan);
if(nextDoc < 0){
//no more docs
return latestDocument = nextDoc;
}
getHitsOnCurrentDocument();
if(hitsOnCurrentDocument.isEmpty()) nextDoc = -1;
}
return latestDocument = nextDoc;
}
protected void getHitsOnCurrentDocument() throws IOException{
//extract all hits on the current document
List<Binding> hits = new ArrayList<Binding>();
Binding aHit = wrappedExecutor.nextHit();
while(aHit != null){
hits.add(aHit);
aHit = wrappedExecutor.nextHit();
}
//calculate the marks for each hit, at each position in the sequence
int[][] marks = new int[query.max][];
for(int i = 0; i < marks.length; i++){
marks[i] = new int[hits.size()];
}
//we mark from right to left, starting from the penultimate list
//for the last list, all elements are marked as valid
//VT: next lines removed, as Java initialises arrays with 0
// for(int i = 0; i < marks[marks.length -1].length; i++){
// marks[marks.length -1][i] = 0;
// }
int currentSlot = marks.length -2;
while(currentSlot >= 0){
//Should we only mark an entry if its next slot has a suffix itself?
boolean suffixCompulsory = currentSlot < query.min -2;
for(int i = 0 ; i < hits.size(); i++ ){
//for each candidate hit
int nextStart = hits.get(i).getTermPosition() +
hits.get(i).getLength();
//find the first hit in the next list and store its pointer
//search for the first valid next hit
//we start from the current hit position +1 (hits are ordered, and a
//hit cannot follow itself)
int nextHitIdx = i + 1;
//first skip the hits too small
while(nextHitIdx < marks[currentSlot + 1].length &&
hits.get(nextHitIdx).getTermPosition() < nextStart){
nextHitIdx++;
}
//next process all candidates, until we find a valid one, or we run out
marks[currentSlot][i] = -1;
while(nextHitIdx < marks[currentSlot + 1].length &&
hits.get(nextHitIdx).getTermPosition() == nextStart){
//A good hit is one that:
// - starts at nextStart, and
// - if suffixCompulsory, has a non-negative mark itself
if(suffixCompulsory){
if(marks[currentSlot +1][nextHitIdx] >= 0){
//we found the next hit
marks[currentSlot][i] = nextHitIdx;
break;
}else{
nextHitIdx++;
}
}else{
//no other conditions required
marks[currentSlot][i] = nextHitIdx;
break;
}
}
}//for(int i = 0 ; i < hits.size(); i++ )
currentSlot--;
}//while(currentSlot >= 0)
//we finished marking, now we collect results from left to right
hitsOnCurrentDocument.clear();
//for each marked starting point on the first slot, start enumerating the
//hits
for(int i = 0; i < marks[0].length; i++){
if(marks[0][i] >= 0 || query.min == 1){
Binding[] slots = new Binding[query.max];
extractHitsRec(query, hitsOnCurrentDocument, hits, marks, 0, i, slots);
}
}
}
/**
* Recursively extracts the hits
* @param hits the arrays of hits, one row for each of the {@link #executorsCache}.
* @param marks the marks for the hits
* @param currentSlot which slot to fill at this stage (used for recursion).
* @param currentHit for the current slot, which candidate hit to start from.
* @param slots the array of slots that needs filling.
* @see #computeMark(int, int, Binding[], int[])
*/
protected static void extractHitsRec(RepeatsQuery query,
List<Binding[]> results, List<Binding> hits, int[][] marks,
int currentSlot, int currentHit, Binding[] slots){
slots[currentSlot] = hits.get(currentHit);
if(currentSlot >= query.min -1){
//we have a full result -> make a copy of the right length and add it
Binding[] aResult = new Binding[currentSlot +1];
for(int i = 0; i<= currentSlot; i++) aResult[i] = slots[i];
results.add(aResult);
}
//if we're not at the end yet, keep recursing
if(currentSlot < query.max -1){
if(marks[currentSlot][currentHit] >=0){
//recursive call for the first next candidate
extractHitsRec(query, results, hits, marks, currentSlot + 1,
marks[currentSlot][currentHit], slots);
//now find all other candidates for next slot
for(int i = marks[currentSlot][currentHit] +1;
i < marks[currentSlot + 1].length &&
slots[currentSlot].getTermPosition() +
slots[currentSlot].getLength() == hits.get(i).getTermPosition();
i++){
//The next candidate starts at the right position.
//If it is valid, add it to solution and do recursive call.
//do we need a suffix?
if(currentSlot < query.min -2){
//the next hit must have a valid mark itself
if(marks[currentSlot + 1][i] >= 0){
//copy the slots array
Binding[] newSlots = new Binding[slots.length];
for(int j = 0; j <= currentSlot; j++){
newSlots[j] = slots[j];
}
extractHitsRec(query, results, hits, marks, currentSlot + 1, i, newSlots);
}
}else{
//we don't need to enforce a suffix -> unconditional recursive call
Binding[] newSlots = new Binding[slots.length];
for(int j = 0; j <= currentSlot; j++){
newSlots[j] = slots[j];
}
extractHitsRec(query, results, hits, marks, currentSlot + 1, i, newSlots);
}
}
}else{
//no furhter advance possible
}
}
}
/* (non-Javadoc)
* @see gate.mimir.search.query.QueryExecutor#nextHit()
*/
public Binding nextHit() throws IOException {
if(closed) return null;
if(hitsOnCurrentDocument.isEmpty()){
return null;
}else{
Binding[] hitSlots = hitsOnCurrentDocument.remove(0);
int length = hitSlots[hitSlots.length - 1].getTermPosition() +
hitSlots[hitSlots.length - 1].getLength() -
hitSlots[0].getTermPosition();
return new Binding(query, hitSlots[0].getDocumentId(),
hitSlots[0].getTermPosition(),length ,
(engine.isSubBindingsEnabled() ? hitSlots : null));
}
}
@Override
public ReferenceSet<Index> indices() {
return wrappedExecutor.indices();
}
public <T> T accept( final DocumentIteratorVisitor<T> visitor ) throws IOException {
if ( ! visitor.visitPre( this ) ) return null;
final T[] a = visitor.newArray( 1 );
if ( a == null ) {
if ( wrappedExecutor.accept( visitor ) == null ) return null;
}
else {
if ( ( a[ 0 ] = wrappedExecutor.accept( visitor ) ) == null ) return null;
}
return visitor.visitPost( this, a );
}
public <T> T acceptOnTruePaths( final DocumentIteratorVisitor<T> visitor ) throws IOException {
if ( ! visitor.visitPre( this ) ) return null;
final T[] a = visitor.newArray( 1 );
if ( a == null ) {
if ( wrappedExecutor.acceptOnTruePaths( visitor ) == null ) return null;
}
else {
if ( ( a[ 0 ] = wrappedExecutor.acceptOnTruePaths( visitor ) ) == null ) return null;
}
return visitor.visitPost( this, a );
}
}
/**
* The minimum number of repeats required.
*/
protected int min;
/**
* The maximum number of repeats permitted.
*/
protected int max;
/**
* The wrapped query.
*/
protected QueryNode wrappedQuery;
/**
* Creates a new repeats query.
* @param query the query to be wrapped.
* @param min the minimum number of repeats required. This value needs to be
* greater than 0.
* @param max the maximum number of repeats permitted. This value needs to be
* greater or equal to the value given for <code>min</code>.
*/
public RepeatsQuery(QueryNode query, int min, int max){
this.wrappedQuery = query;
if(min <= 0) throw new IllegalArgumentException(
"The value provided for minimum (" + min +
") is not strictly positive!");
if(max < min) throw new IllegalArgumentException(
"The value provided for maximum repeats (" + max +
") is smaller than thee value for minimum repeats (" + min + ").");
this.min = min;
this.max = max;
}
/* (non-Javadoc)
* @see gate.mimir.search.query.QueryNode#getQueryExecutor(gate.mimir.search.QueryEngine)
*/
public QueryExecutor getQueryExecutor(QueryEngine engine) throws IOException {
return new RepeatsQueryExecutor(this, engine);
}
public String toString() {
return "REPEATS (" + wrappedQuery.toString() + " [" + min + ".." +
max + "])";
}
public int getMin() {
return min;
}
public int getMax() {
return max;
}
public QueryNode getWrappedQuery() {
return wrappedQuery;
}
}