AtomicAnnotationIndex.java

/*
 *  AtomicMentionsIndex.java
 *
 *  Copyright (c) 2007-2014, The University of Sheffield.
 *
 *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
 *  and is free software, licenced under the GNU Lesser General Public License,
 *  Version 3, June 2007 (also included with this distribution as file
 *  LICENCE-LGPL3.html).
 *
 *  Valentin Tablan, 10 Jan 2014
 *
 *  $Id: AtomicAnnotationIndex.java 18573 2015-02-12 19:07:12Z ian_roberts $
 */
package gate.mimir.index;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.FeatureMap;
import gate.Node;
import gate.annotation.AnnotationImpl;
import gate.event.AnnotationListener;
import gate.mimir.IndexConfig;
import gate.mimir.MimirIndex;
import gate.mimir.SemanticAnnotationHelper;
import gate.mimir.IndexConfig.SemanticIndexerConfig;
import gate.util.OffsetComparator;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;

import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.dsi.lang.ObjectParser;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 */
public class AtomicAnnotationIndex extends AtomicIndex {
  
  private final static Logger logger = LoggerFactory.getLogger(AtomicAnnotationIndex.class);
  
  /**
   * A simple object of type {@link Annotation} that can be used as a special marker.
   */
  private static class ConstAnnotation extends AnnotationImpl {
    private static final long serialVersionUID = 8224738902788616055L;
    public ConstAnnotation() {
      super(null, null, null, null, null);
    }
  }
  
  private static final Annotation DOCUMENT_VIRTUAL_ANN = new ConstAnnotation();
  
  /**
   * The {@link IndexConfig} used by the {@link MimirIndex} that contains this
   * mentions index.
   */
  protected IndexConfig indexConfig;
  
  protected SemanticIndexerConfig semIdxConfid;
  /**
   * Helpers for each semantic annotation type.
   */
  protected Map<String, SemanticAnnotationHelper> annotationHelpers;
  
  protected List<SemanticAnnotationHelper> documentHelpers;
  
  /**
   * An {@link OffsetComparator} used to sort the annotations by offset before 
   * indexing.
   */
  protected OffsetComparator offsetComparator;
  
  /**
   * Creates a new atomic index for indexing annotations.
   * @param parent the top level {@link MimirIndex} to which this new atomic 
   * index belongs.
   * @param name the name for the new atomic index. This will be used as the
   * name of the top level directory for this atomic index (which is a 
   * sub-directory of the parent) and as a base name for all the files of this 
   * atomic index.
   * @param hasDirectIndex should a direct index be created as well.
   * @param inputQueue the queue where documents are submitted for indexing;
   * @param outputQueue the queue where indexed documents are returned to;
   * @throws IndexException 
   * @throws IOException 
   */
  public AtomicAnnotationIndex(MimirIndex parent, String name, 
      boolean hasDirectIndex,
      BlockingQueue<GATEDocument> inputQueue,
      BlockingQueue<GATEDocument> outputQueue,
      SemanticIndexerConfig siConfig) throws IOException, IndexException {
    super(parent, name, hasDirectIndex, 
        NullTermProcessor.getInstance(), inputQueue, outputQueue);
    this.semIdxConfid = siConfig;
    indexConfig = parent.getIndexConfig();
    //get the helpers
    annotationHelpers = new HashMap<String, SemanticAnnotationHelper>(
              siConfig.getAnnotationTypes().length);
    documentHelpers = new LinkedList<SemanticAnnotationHelper>();
    for(int i = 0; i <  siConfig.getAnnotationTypes().length; i++){
      SemanticAnnotationHelper theHelper = siConfig.getHelpers()[i];
      theHelper.init(this);
      if(theHelper.getMode() == SemanticAnnotationHelper.Mode.DOCUMENT) {
        documentHelpers.add(theHelper);
      } else {
        annotationHelpers.put(siConfig.getAnnotationTypes()[i], theHelper);  
      }
    }
    offsetComparator = new OffsetComparator();
    // start the indexing thread
    indexingThread = new Thread(this, "Mimir-" + name + " indexing thread");
    indexingThread.start();
  }

  
  
  @Override
  protected void documentStarting(GATEDocument gateDocument)
      throws IndexException {
    for(SemanticAnnotationHelper aHelper : annotationHelpers.values()){
      aHelper.documentStart(gateDocument.getDocument());
    }
    for(SemanticAnnotationHelper aHelper : documentHelpers){
      aHelper.documentStart(gateDocument.getDocument());
    }
  }

  @Override
  protected void documentEnding(GATEDocument gateDocument)
      throws IndexException {
    for(SemanticAnnotationHelper aHelper : annotationHelpers.values()){
      aHelper.documentEnd();
    }
    // index the document mode annotations
    if(!documentHelpers.isEmpty()) {
      processAnnotation(DOCUMENT_VIRTUAL_ANN, gateDocument);
    }
    for(SemanticAnnotationHelper aHelper : documentHelpers){     
      aHelper.documentEnd();
    }
  }



  /* (non-Javadoc)
   * @see gate.mimir.index.AtomicIndex#getAnnotsToProcess(gate.mimir.index.mg4j.GATEDocument)
   */
  @Override
  protected Annotation[] getAnnotsToProcess(GATEDocument gateDocument)
      throws IndexException {
    Document document = gateDocument.getDocument();
    Annotation[] semanticAnnots;
    AnnotationSet semAnnSet = 
      (indexConfig.getSemanticAnnotationSetName() == null ||
      indexConfig.getSemanticAnnotationSetName().length() == 0) ?
      document.getAnnotations() :
      document.getAnnotations(indexConfig.getSemanticAnnotationSetName());
    if(semAnnSet.size() > 0){
      AnnotationSet semAnns = null;
      synchronized(semAnnSet) {
        semAnns = semAnnSet.get(annotationHelpers.keySet());
      }
      semanticAnnots = semAnns.toArray(new Annotation[semAnns.size()]);
      Arrays.sort(semanticAnnots, offsetComparator);
    } else {
      semanticAnnots  = new Annotation[0];
    }
    return semanticAnnots;
  }

  /* (non-Javadoc)
   * @see gate.mimir.index.AtomicIndex#calculateStartPositionForAnnotation(gate.Annotation, gate.mimir.index.mg4j.GATEDocument)
   */
  @Override
  protected void calculateStartPositionForAnnotation(Annotation ann,
      GATEDocument gateDocument) throws IndexException {
    if(ann == DOCUMENT_VIRTUAL_ANN) {
      // we're supposed index the document metadata
      tokenPosition = 0;
    } else {
      //calculate the term position for the current semantic annotation
      while(tokenPosition <  gateDocument.getTokenAnnots().length &&
            gateDocument.getTokenAnnots()[tokenPosition].
              getEndNode().getOffset().longValue() <= 
              ann.getStartNode().getOffset().longValue()){
        tokenPosition++;
      }
      //check if lastTokenposition is valid
      if(tokenPosition >= gateDocument.getTokenAnnots().length){
        //malfunction
        logger.error(
                "Semantic annotation [Type:" + ann.getType() +
                ", start: " + ann.getStartNode().getOffset().toString() +
                ", end: " + ann.getEndNode().getOffset().toString() +
                "] outside of the tokens area in document" +
                " URI: " + gateDocument.uri() +
                " Title: " + gateDocument.title());
      }      
    }
  }

  /* (non-Javadoc)
   * @see gate.mimir.index.AtomicIndex#calculateTermStringForAnnotation(gate.Annotation, gate.mimir.index.mg4j.GATEDocument)
   */
  @Override
  protected String[] calculateTermStringForAnnotation(Annotation ann,
      GATEDocument gateDocument) throws IndexException {
    if(ann == DOCUMENT_VIRTUAL_ANN) {
      // obtain the URIs to be indexed for the *document* metadata
      List<String> terms = new LinkedList<String>();
      for(SemanticAnnotationHelper aHelper : documentHelpers) {
        String[] someTerms = aHelper.getMentionUris(null, Mention.NO_LENGTH, this);
        if(someTerms != null) {
          for(String aTerm : someTerms) {
            terms.add(aTerm);
          }
        }
      }
      return terms.toArray(new String[terms.size()]);
    } else {
      //calculate the annotation length (as number of terms)
      SemanticAnnotationHelper helper = annotationHelpers.get(ann.getType());
      int length = 1;
      while(tokenPosition + length <  gateDocument.getTokenAnnots().length &&
              gateDocument.getTokenAnnots()[tokenPosition + length].
                getStartNode().getOffset().longValue() < 
                ann.getEndNode().getOffset().longValue()){
          length++;
        }
      //get the annotation URI
      return helper.getMentionUris(ann, length, this);
    }
  }

  @Override
  protected void flush() throws IOException {
    for(SemanticAnnotationHelper sah : annotationHelpers.values()) {
      sah.close(this);
    }
  }
  
  
}