GATEDocument.java
- /*
- * GATEDocument.java
- *
- * Copyright (c) 2007-2011, The University of Sheffield.
- *
- * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
- * and is free software, licenced under the GNU Lesser General Public License,
- * Version 3, June 2007 (also included with this distribution as file
- * LICENCE-LGPL3.html).
- *
- * Valentin Tablan, 24 Feb 2009
- *
- * $Id: GATEDocument.java 17307 2014-02-14 11:47:27Z valyt $
- */
- package gate.mimir.index;
- import gate.Annotation;
- import gate.AnnotationSet;
- import gate.mimir.IndexConfig;
- import gate.util.OffsetComparator;
- import it.unimi.dsi.io.WordReader;
- import it.unimi.dsi.lang.MutableString;
- import it.unimi.di.big.mg4j.document.Document;
- import java.io.*;
- import java.util.Arrays;
- import java.util.concurrent.BlockingQueue;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- /**
- * An implementation of MG4J Document interface for representing GATE documents
- * during the indexing process.
- */
- public class GATEDocument implements Document {
- /**
- * The URI prefix used for generating document URIs, when no explicit URI is
- * provided as a document feature.
- * The actual URIs will comprise this value with a number appended, generated
- * by {@link #documentID}++.
- */
- private static final String DOCUMENT_URI_PREFIX = "urn:mimir:document:";
- /**
- * A reader used to satisfy the MG4J interfaces, but that provides no actual
- * data.
- */
- private static final Reader emptyReader = new StringReader("");
-
- private static Logger logger = LoggerFactory.getLogger(GATEDocument.class);
-
- /**
- * Used to generate unique document URIs, if no URIs are provided as document
- * features.
- */
- private static long documentID = 0;
-
- /**
- * The number of occurrences (in all sub-indexes) generated as a result of
- * indexing this document.
- */
- private long occurrences = 0;
-
- /**
- * An MG4J word reader for this document.
- */
- private class GATEDocumentWordReader implements WordReader{
- /**
- * the index of the next token
- */
- private int index = 0;
-
- /**
- * The token feature from which the data is read.
- */
- private String tokenFeature;
-
- /**
- * Constructs a GATE Document reader.
- * @param tokens an array of token annotations, sorted by offset.
- * @param nonTokens an array of string, representing the non-tokens (the
- * document content between tokens).
- * @param tokenFeature the name of the feature to be read from the token
- * annotations.
- */
- public GATEDocumentWordReader(String tokenFeature){
- this.tokenFeature = tokenFeature;
- }
-
- /* (non-Javadoc)
- * @see it.unimi.dsi.io.WordReader#copy()
- */
- public WordReader copy() {
- return this;
- }
- /* (non-Javadoc)
- * @see it.unimi.dsi.io.WordReader#next(it.unimi.dsi.lang.MutableString, it.unimi.dsi.lang.MutableString)
- */
- public boolean next(MutableString word, MutableString nonWord)
- throws IOException {
- if(index < tokenAnnots.length){
- word.replace((String)tokenAnnots[index].getFeatures().get(tokenFeature));
- nonWord.replace(nonTokens[index]);
- index++;
- return true;
- }else{
- return false;
- }
- }
- /* (non-Javadoc)
- * @see it.unimi.dsi.io.WordReader#setReader(java.io.Reader)
- */
- public WordReader setReader(Reader reader) {
- if(reader != emptyReader)
- throw new UnsupportedOperationException(getClass().getName() +
- " does not support resetting!");
- return this;
- }
-
- }
-
- /**
- * The index config for this document
- */
- private IndexConfig indexConfig;
-
- /**
- * The queue where this document should add itself upon closing.
- */
- private BlockingQueue<GATEDocument> outputQueue;
-
- /**
- * The GATE Document wrapped by this object.
- */
- private gate.Document gateDocument;
-
- /**
- * A list of all the token annotations, sorted by offset.
- */
- private Annotation[] tokenAnnots;
-
- /**
- * A list containing all the strings between tokens.
- */
- private String[] nonTokens;
-
- /**
- * A special instance of GATEDocument used to mark the end of a queue.
- */
- public static final GATEDocument END_OF_QUEUE = new GATEDocument();
-
- /**
- * Private constructor used to create the {@link #END_OF_QUEUE} instance.
- */
- protected GATEDocument(){
- }
-
- public GATEDocument(gate.Document gateDocument,
- IndexConfig indexConfig){
- this.gateDocument = gateDocument;
- this.indexConfig = indexConfig;
-
- //build the list of tokens
- AnnotationSet tokenSet = indexConfig.getTokenAnnotationSetName() == null?
- gateDocument.getAnnotations() :
- gateDocument.getAnnotations(indexConfig.getTokenAnnotationSetName());
- AnnotationSet allTokens = null;
- if(tokenSet != null) {
- synchronized(tokenSet) {
- allTokens = tokenSet.get(indexConfig
- .getTokenAnnotationType());
- }
- }
- if(allTokens != null && allTokens.size() > 0){
- //we have some tokens
- tokenAnnots = allTokens.toArray(new Annotation[allTokens.size()]);
- Arrays.sort(tokenAnnots, new OffsetComparator());
- }else{
- //no tokens
- tokenAnnots = new Annotation[0];
- }
- //build the list of non-tokens
- nonTokens = new String[tokenAnnots.length];
- String docContent = gateDocument.getContent().toString();
- //for each token, add the doc content after it (and before the next token)
- //to the nonTokens array.
- for(int i = 0; i < tokenAnnots.length - 1; i++){
- int nonTokenStart = tokenAnnots[i].getEndNode().getOffset().intValue();
- int nonTokenEnd = tokenAnnots[i+1].getStartNode().getOffset().intValue();
- nonTokens[i] = (nonTokenStart < nonTokenEnd) ?
- docContent.substring(nonTokenStart, nonTokenEnd) : "";
- }
- //set the last value to all remaining document content, if we have any tokens
- if(tokenAnnots.length > 0){
- int nonTokenStart = tokenAnnots[tokenAnnots.length - 1].getEndNode().
- getOffset().intValue();
- nonTokens[nonTokens.length -1] = (nonTokenStart < docContent.length()) ?
- docContent.substring(nonTokenStart) : "";
- }
- }
-
- /* (non-Javadoc)
- * @see it.unimi.dsi.mg4j.document.Document#close()
- */
- public void close() throws IOException {
- // put the finished document in the output queue
- try {
- outputQueue.put(this);
- } catch(InterruptedException e) {
- Thread.currentThread().interrupt();
- }
- }
-
- /**
- * Sets the output queue for this document. When the {@link #close()} method
- * is called, this document will add itself to the output queue.
- * @param outputQueue the outputQueue to set
- */
- public void setOutputQueue(BlockingQueue<GATEDocument> outputQueue) {
- this.outputQueue = outputQueue;
- }
-
- /**
- * Obtains the GATE document wrapped by this object.
- * @return the gateDocument
- */
- public gate.Document getDocument() {
- return gateDocument;
- }
- /* (non-Javadoc)
- * @see it.unimi.dsi.mg4j.document.Document#content(int)
- */
- public Object content(int field) throws IOException {
- return emptyReader;
- }
- /* (non-Javadoc)
- * @see it.unimi.dsi.mg4j.document.Document#title()
- */
- public CharSequence title() {
- return gateDocument.getName();
- }
- /* (non-Javadoc)
- * @see it.unimi.dsi.mg4j.document.Document#uri()
- */
- public synchronized CharSequence uri() {
- String uri = (String)gateDocument.getFeatures().get(
- indexConfig.getDocumentUriFeatureName());
- if(uri == null){
- uri = DOCUMENT_URI_PREFIX + documentID;
- logger.warn(
- "No document URI provided, generating a default one: " + documentID);
- documentID++;
- gateDocument.getFeatures().put(
- indexConfig.getDocumentUriFeatureName(), uri);
- }
- return uri;
- }
- /**
- * Notifies this GATEDocument that some more index occurrences were produced
- * in the process of indexing it.
- *
- * This method is synchronized because the same GATEDocument instance is being
- * indexed in parallel by multiple sub-indexers.
- *
- * @param newOccurrences the number of new occurrences generated
- */
- public synchronized void addOccurrences(long newOccurrences) {
- occurrences += newOccurrences;
- }
-
- /**
- * Returns the number of index occurrences that the indexing of this
- * GATEDocument has generated. This value is only correct after the document
- * has been indexed by all sub-indexers.
- *
- * @return the number of occurrences.
- */
- public long getOccurrences() {
- return occurrences;
- }
- /* (non-Javadoc)
- * @see it.unimi.dsi.mg4j.document.Document#wordReader(int)
- */
- public WordReader wordReader(int field) {
- return new GATEDocumentWordReader(
- indexConfig.getTokenIndexers()[field].getFeatureName());
- }
- /**
- * Gets the array of offset-sorted token annotations for this document.
- * The value returned is the actual internally used array, so modifications
- * can lead to undefined behaviour!
- * @return the tokenAnnots
- */
- public Annotation[] getTokenAnnots() {
- return tokenAnnots;
- }
- /**
- * Gets the array of string representing the document content segments between
- * the token annotations.
- * The value returned is the actual internally used array, so modifications
- * can lead to undefined behaviour!
- * @return the nonTokens
- */
- public String[] getNonTokens() {
- return nonTokens;
- }
-
-
- }