GATEDocument.java

  1. /*
  2.  *  GATEDocument.java
  3.  *
  4.  *  Copyright (c) 2007-2011, The University of Sheffield.
  5.  *
  6.  *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  *  and is free software, licenced under the GNU Lesser General Public License,
  8.  *  Version 3, June 2007 (also included with this distribution as file
  9.  *  LICENCE-LGPL3.html).
  10.  *
  11.  *  Valentin Tablan, 24 Feb 2009
  12.  *
  13.  *  $Id: GATEDocument.java 17307 2014-02-14 11:47:27Z valyt $
  14.  */
  15. package gate.mimir.index;

  16. import gate.Annotation;
  17. import gate.AnnotationSet;
  18. import gate.mimir.IndexConfig;
  19. import gate.util.OffsetComparator;
  20. import it.unimi.dsi.io.WordReader;
  21. import it.unimi.dsi.lang.MutableString;
  22. import it.unimi.di.big.mg4j.document.Document;

  23. import java.io.*;
  24. import java.util.Arrays;
  25. import java.util.concurrent.BlockingQueue;

  26. import org.slf4j.Logger;
  27. import org.slf4j.LoggerFactory;


  28. /**
  29.  * An implementation of MG4J Document interface for representing GATE documents
  30.  * during the indexing process.
  31.  */
  32. public class GATEDocument implements Document {
  33.   /**
  34.    * The URI prefix used for generating document URIs, when no explicit URI is
  35.    * provided as a document feature.
  36.    * The actual URIs will comprise this value with a number appended, generated
  37.    * by {@link #documentID}++.
  38.    */
  39.   private static final String DOCUMENT_URI_PREFIX = "urn:mimir:document:";

  40.   /**
  41.    * A reader used to satisfy the MG4J interfaces, but that provides no actual
  42.    * data.
  43.    */
  44.   private static final Reader emptyReader = new StringReader("");

  45.  
  46.   private static Logger logger = LoggerFactory.getLogger(GATEDocument.class);
  47.  
  48.   /**
  49.    * Used to generate unique document URIs, if no URIs are provided as document
  50.    * features.
  51.    */
  52.   private static long documentID = 0;
  53.  
  54.   /**
  55.    * The number of occurrences (in all sub-indexes) generated as a result of
  56.    * indexing this document.
  57.    */
  58.   private long occurrences = 0;
  59.  
  60.   /**
  61.    * An MG4J word reader for this document.
  62.    */
  63.   private class GATEDocumentWordReader implements WordReader{
  64.     /**
  65.      * the index of the next token
  66.      */
  67.     private int index = 0;
  68.    
  69.     /**
  70.      * The token feature from which the data is read.
  71.      */
  72.     private String tokenFeature;
  73.    
  74.     /**
  75.      * Constructs a GATE Document reader.
  76.      * @param tokens an array of token annotations, sorted by offset.  
  77.      * @param nonTokens an array of string, representing the non-tokens (the
  78.      * document content between tokens).
  79.      * @param tokenFeature the name of the feature to be read from the token
  80.      * annotations.
  81.      */
  82.     public GATEDocumentWordReader(String tokenFeature){
  83.       this.tokenFeature = tokenFeature;
  84.     }
  85.    
  86.     /* (non-Javadoc)
  87.      * @see it.unimi.dsi.io.WordReader#copy()
  88.      */
  89.     public WordReader copy() {
  90.       return this;
  91.     }

  92.     /* (non-Javadoc)
  93.      * @see it.unimi.dsi.io.WordReader#next(it.unimi.dsi.lang.MutableString, it.unimi.dsi.lang.MutableString)
  94.      */
  95.     public boolean next(MutableString word, MutableString nonWord)
  96.             throws IOException {
  97.       if(index < tokenAnnots.length){
  98.         word.replace((String)tokenAnnots[index].getFeatures().get(tokenFeature));
  99.         nonWord.replace(nonTokens[index]);
  100.         index++;
  101.         return true;
  102.       }else{
  103.         return false;  
  104.       }
  105.     }

  106.     /* (non-Javadoc)
  107.      * @see it.unimi.dsi.io.WordReader#setReader(java.io.Reader)
  108.      */
  109.     public WordReader setReader(Reader reader) {
  110.       if(reader != emptyReader)
  111.         throw new UnsupportedOperationException(getClass().getName() +
  112.               " does not support resetting!");
  113.       return this;
  114.     }
  115.    
  116.   }
  117.  
  118.   /**
  119.    * The index config for this document
  120.    */
  121.   private IndexConfig indexConfig;
  122.  
  123.   /**
  124.    * The queue where this document should add itself upon closing.
  125.    */
  126.   private BlockingQueue<GATEDocument> outputQueue;
  127.  
  128.   /**
  129.    * The GATE Document wrapped by this object.
  130.    */
  131.   private gate.Document gateDocument;
  132.  
  133.   /**
  134.    * A list of all the token annotations, sorted by offset.
  135.    */
  136.   private Annotation[] tokenAnnots;
  137.  
  138.   /**
  139.    * A list containing all the strings between tokens.
  140.    */
  141.   private String[] nonTokens;
  142.  
  143.   /**
  144.    * A special instance of GATEDocument used to mark the end of a queue.
  145.    */
  146.   public static final GATEDocument END_OF_QUEUE = new GATEDocument();
  147.  
  148.   /**
  149.    * Private constructor used to create the {@link #END_OF_QUEUE} instance.
  150.    */
  151.   protected GATEDocument(){
  152.   }
  153.  
  154.   public GATEDocument(gate.Document gateDocument,
  155.           IndexConfig indexConfig){
  156.     this.gateDocument = gateDocument;
  157.     this.indexConfig = indexConfig;
  158.    
  159.     //build the list of tokens
  160.     AnnotationSet tokenSet = indexConfig.getTokenAnnotationSetName() == null?
  161.             gateDocument.getAnnotations() :
  162.             gateDocument.getAnnotations(indexConfig.getTokenAnnotationSetName());  
  163.     AnnotationSet allTokens = null;
  164.     if(tokenSet != null) {
  165.       synchronized(tokenSet) {
  166.         allTokens = tokenSet.get(indexConfig
  167.                         .getTokenAnnotationType());
  168.       }
  169.     }
  170.     if(allTokens != null && allTokens.size() > 0){
  171.       //we have some tokens
  172.       tokenAnnots = allTokens.toArray(new Annotation[allTokens.size()]);
  173.       Arrays.sort(tokenAnnots, new OffsetComparator());
  174.     }else{
  175.       //no tokens
  176.       tokenAnnots = new Annotation[0];
  177.     }
  178.     //build the list of non-tokens
  179.     nonTokens = new String[tokenAnnots.length];
  180.     String docContent = gateDocument.getContent().toString();
  181.     //for each token, add the doc content after it (and before the next token)
  182.     //to the nonTokens array.
  183.     for(int i = 0; i < tokenAnnots.length - 1; i++){
  184.       int nonTokenStart = tokenAnnots[i].getEndNode().getOffset().intValue();
  185.       int nonTokenEnd = tokenAnnots[i+1].getStartNode().getOffset().intValue();
  186.       nonTokens[i] = (nonTokenStart < nonTokenEnd) ?
  187.               docContent.substring(nonTokenStart, nonTokenEnd) : "";
  188.     }
  189.     //set the last value to all remaining document content, if we have any tokens
  190.     if(tokenAnnots.length > 0){
  191.       int nonTokenStart = tokenAnnots[tokenAnnots.length - 1].getEndNode().
  192.           getOffset().intValue();
  193.       nonTokens[nonTokens.length -1] = (nonTokenStart < docContent.length()) ?
  194.               docContent.substring(nonTokenStart) : "";
  195.     }
  196.   }
  197.  
  198.   /* (non-Javadoc)
  199.    * @see it.unimi.dsi.mg4j.document.Document#close()
  200.    */
  201.   public void close() throws IOException {
  202.     // put the finished document in the output queue
  203.     try {
  204.       outputQueue.put(this);
  205.     } catch(InterruptedException e) {
  206.       Thread.currentThread().interrupt();
  207.     }
  208.   }

  209.  
  210.   /**
  211.    * Sets the output queue for this document. When the {@link #close()} method
  212.    * is called, this document will add itself to the output queue.  
  213.    * @param outputQueue the outputQueue to set
  214.    */
  215.   public void setOutputQueue(BlockingQueue<GATEDocument> outputQueue) {
  216.     this.outputQueue = outputQueue;
  217.   }

  218.  
  219.   /**
  220.    * Obtains the GATE document wrapped by this object.
  221.    * @return the gateDocument
  222.    */
  223.   public gate.Document getDocument() {
  224.     return gateDocument;
  225.   }

  226.   /* (non-Javadoc)
  227.    * @see it.unimi.dsi.mg4j.document.Document#content(int)
  228.    */
  229.   public Object content(int field) throws IOException {
  230.     return emptyReader;
  231.   }

  232.   /* (non-Javadoc)
  233.    * @see it.unimi.dsi.mg4j.document.Document#title()
  234.    */
  235.   public CharSequence title() {
  236.     return gateDocument.getName();
  237.   }

  238.   /* (non-Javadoc)
  239.    * @see it.unimi.dsi.mg4j.document.Document#uri()
  240.    */
  241.   public synchronized CharSequence uri() {
  242.     String uri = (String)gateDocument.getFeatures().get(
  243.             indexConfig.getDocumentUriFeatureName());
  244.     if(uri == null){
  245.       uri = DOCUMENT_URI_PREFIX + documentID;
  246.       logger.warn(
  247.         "No document URI provided, generating a default one: " + documentID);
  248.       documentID++;
  249.       gateDocument.getFeatures().put(
  250.               indexConfig.getDocumentUriFeatureName(), uri);
  251.     }
  252.     return uri;
  253.   }

  254.   /**
  255.    * Notifies this GATEDocument that some more index occurrences were produced
  256.    * in the process of indexing it.
  257.    *
  258.    * This method is synchronized because the same GATEDocument instance is being
  259.    * indexed in parallel by multiple sub-indexers.
  260.    *  
  261.    * @param newOccurrences the number of new occurrences generated
  262.    */
  263.   public synchronized void addOccurrences(long newOccurrences) {
  264.     occurrences += newOccurrences;
  265.   }
  266.  
  267.   /**
  268.    * Returns the number of index occurrences that the indexing of this
  269.    * GATEDocument has generated. This value is only correct after the document
  270.    * has been indexed by all sub-indexers.
  271.    *
  272.    * @return the number of occurrences.
  273.    */
  274.   public long getOccurrences() {
  275.     return occurrences;
  276.   }

  277.   /* (non-Javadoc)
  278.    * @see it.unimi.dsi.mg4j.document.Document#wordReader(int)
  279.    */
  280.   public WordReader wordReader(int field) {
  281.     return new GATEDocumentWordReader(
  282.             indexConfig.getTokenIndexers()[field].getFeatureName());
  283.   }

  284.   /**
  285.    * Gets the array of offset-sorted token annotations for this document.
  286.    * The value returned is the actual internally used array, so modifications
  287.    * can lead to undefined behaviour!
  288.    * @return the tokenAnnots
  289.    */
  290.   public Annotation[] getTokenAnnots() {
  291.     return tokenAnnots;
  292.   }

  293.   /**
  294.    * Gets the array of string representing the document content segments between
  295.    * the token annotations.
  296.    * The value returned is the actual internally used array, so modifications
  297.    * can lead to undefined behaviour!
  298.    * @return the nonTokens
  299.    */
  300.   public String[] getNonTokens() {
  301.     return nonTokens;
  302.   }
  303.  
  304.  
  305. }