// GATEDocument.java
/*
* GATEDocument.java
*
* Copyright (c) 2007-2011, The University of Sheffield.
*
* This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
* and is free software, licenced under the GNU Lesser General Public License,
* Version 3, June 2007 (also included with this distribution as file
* LICENCE-LGPL3.html).
*
* Valentin Tablan, 24 Feb 2009
*
* $Id: GATEDocument.java 17307 2014-02-14 11:47:27Z valyt $
*/
package gate.mimir.index;
import gate.Annotation;
import gate.AnnotationSet;
import gate.mimir.IndexConfig;
import gate.util.OffsetComparator;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.di.big.mg4j.document.Document;
import java.io.*;
import java.util.Arrays;
import java.util.concurrent.BlockingQueue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An implementation of MG4J Document interface for representing GATE documents
* during the indexing process.
*/
public class GATEDocument implements Document {
/**
* The URI prefix used for generating document URIs, when no explicit URI is
* provided as a document feature.
* The actual URIs will comprise this value with a number appended, generated
* by {@link #documentID}++.
*/
private static final String DOCUMENT_URI_PREFIX = "urn:mimir:document:";
/**
* A reader used to satisfy the MG4J interfaces, but that provides no actual
* data.
*/
private static final Reader emptyReader = new StringReader("");
private static Logger logger = LoggerFactory.getLogger(GATEDocument.class);
/**
* Used to generate unique document URIs, if no URIs are provided as document
* features.
*/
private static long documentID = 0;
/**
* The number of occurrences (in all sub-indexes) generated as a result of
* indexing this document.
*/
private long occurrences = 0;
/**
* An MG4J word reader for this document.
*/
private class GATEDocumentWordReader implements WordReader{
/**
* the index of the next token
*/
private int index = 0;
/**
* The token feature from which the data is read.
*/
private String tokenFeature;
/**
* Constructs a GATE Document reader.
* @param tokens an array of token annotations, sorted by offset.
* @param nonTokens an array of string, representing the non-tokens (the
* document content between tokens).
* @param tokenFeature the name of the feature to be read from the token
* annotations.
*/
public GATEDocumentWordReader(String tokenFeature){
this.tokenFeature = tokenFeature;
}
/* (non-Javadoc)
* @see it.unimi.dsi.io.WordReader#copy()
*/
public WordReader copy() {
return this;
}
/* (non-Javadoc)
* @see it.unimi.dsi.io.WordReader#next(it.unimi.dsi.lang.MutableString, it.unimi.dsi.lang.MutableString)
*/
public boolean next(MutableString word, MutableString nonWord)
throws IOException {
if(index < tokenAnnots.length){
word.replace((String)tokenAnnots[index].getFeatures().get(tokenFeature));
nonWord.replace(nonTokens[index]);
index++;
return true;
}else{
return false;
}
}
/* (non-Javadoc)
* @see it.unimi.dsi.io.WordReader#setReader(java.io.Reader)
*/
public WordReader setReader(Reader reader) {
if(reader != emptyReader)
throw new UnsupportedOperationException(getClass().getName() +
" does not support resetting!");
return this;
}
}
/**
* The index config for this document
*/
private IndexConfig indexConfig;
/**
* The queue where this document should add itself upon closing.
*/
private BlockingQueue<GATEDocument> outputQueue;
/**
* The GATE Document wrapped by this object.
*/
private gate.Document gateDocument;
/**
* A list of all the token annotations, sorted by offset.
*/
private Annotation[] tokenAnnots;
/**
* A list containing all the strings between tokens.
*/
private String[] nonTokens;
/**
* A special instance of GATEDocument used to mark the end of a queue.
*/
public static final GATEDocument END_OF_QUEUE = new GATEDocument();
/**
* Private constructor used to create the {@link #END_OF_QUEUE} instance.
*/
protected GATEDocument(){
}
public GATEDocument(gate.Document gateDocument,
IndexConfig indexConfig){
this.gateDocument = gateDocument;
this.indexConfig = indexConfig;
//build the list of tokens
AnnotationSet tokenSet = indexConfig.getTokenAnnotationSetName() == null?
gateDocument.getAnnotations() :
gateDocument.getAnnotations(indexConfig.getTokenAnnotationSetName());
AnnotationSet allTokens = null;
if(tokenSet != null) {
synchronized(tokenSet) {
allTokens = tokenSet.get(indexConfig
.getTokenAnnotationType());
}
}
if(allTokens != null && allTokens.size() > 0){
//we have some tokens
tokenAnnots = allTokens.toArray(new Annotation[allTokens.size()]);
Arrays.sort(tokenAnnots, new OffsetComparator());
}else{
//no tokens
tokenAnnots = new Annotation[0];
}
//build the list of non-tokens
nonTokens = new String[tokenAnnots.length];
String docContent = gateDocument.getContent().toString();
//for each token, add the doc content after it (and before the next token)
//to the nonTokens array.
for(int i = 0; i < tokenAnnots.length - 1; i++){
int nonTokenStart = tokenAnnots[i].getEndNode().getOffset().intValue();
int nonTokenEnd = tokenAnnots[i+1].getStartNode().getOffset().intValue();
nonTokens[i] = (nonTokenStart < nonTokenEnd) ?
docContent.substring(nonTokenStart, nonTokenEnd) : "";
}
//set the last value to all remaining document content, if we have any tokens
if(tokenAnnots.length > 0){
int nonTokenStart = tokenAnnots[tokenAnnots.length - 1].getEndNode().
getOffset().intValue();
nonTokens[nonTokens.length -1] = (nonTokenStart < docContent.length()) ?
docContent.substring(nonTokenStart) : "";
}
}
/* (non-Javadoc)
* @see it.unimi.dsi.mg4j.document.Document#close()
*/
public void close() throws IOException {
// put the finished document in the output queue
try {
outputQueue.put(this);
} catch(InterruptedException e) {
Thread.currentThread().interrupt();
}
}
/**
* Sets the output queue for this document. When the {@link #close()} method
* is called, this document will add itself to the output queue.
* @param outputQueue the outputQueue to set
*/
public void setOutputQueue(BlockingQueue<GATEDocument> outputQueue) {
this.outputQueue = outputQueue;
}
/**
* Obtains the GATE document wrapped by this object.
* @return the gateDocument
*/
public gate.Document getDocument() {
return gateDocument;
}
/* (non-Javadoc)
* @see it.unimi.dsi.mg4j.document.Document#content(int)
*/
public Object content(int field) throws IOException {
return emptyReader;
}
/* (non-Javadoc)
* @see it.unimi.dsi.mg4j.document.Document#title()
*/
public CharSequence title() {
return gateDocument.getName();
}
/* (non-Javadoc)
* @see it.unimi.dsi.mg4j.document.Document#uri()
*/
public synchronized CharSequence uri() {
String uri = (String)gateDocument.getFeatures().get(
indexConfig.getDocumentUriFeatureName());
if(uri == null){
uri = DOCUMENT_URI_PREFIX + documentID;
logger.warn(
"No document URI provided, generating a default one: " + documentID);
documentID++;
gateDocument.getFeatures().put(
indexConfig.getDocumentUriFeatureName(), uri);
}
return uri;
}
/**
* Notifies this GATEDocument that some more index occurrences were produced
* in the process of indexing it.
*
* This method is synchronized because the same GATEDocument instance is being
* indexed in parallel by multiple sub-indexers.
*
* @param newOccurrences the number of new occurrences generated
*/
public synchronized void addOccurrences(long newOccurrences) {
occurrences += newOccurrences;
}
/**
* Returns the number of index occurrences that the indexing of this
* GATEDocument has generated. This value is only correct after the document
* has been indexed by all sub-indexers.
*
* @return the number of occurrences.
*/
public long getOccurrences() {
return occurrences;
}
/* (non-Javadoc)
* @see it.unimi.dsi.mg4j.document.Document#wordReader(int)
*/
public WordReader wordReader(int field) {
return new GATEDocumentWordReader(
indexConfig.getTokenIndexers()[field].getFeatureName());
}
/**
* Gets the array of offset-sorted token annotations for this document.
* The value returned is the actual internally used array, so modifications
* can lead to undefined behaviour!
* @return the tokenAnnots
*/
public Annotation[] getTokenAnnots() {
return tokenAnnots;
}
/**
* Gets the array of string representing the document content segments between
* the token annotations.
* The value returned is the actual internally used array, so modifications
* can lead to undefined behaviour!
* @return the nonTokens
*/
public String[] getNonTokens() {
return nonTokens;
}
}