IndexUpgrader.java

  1. /*
  2.  *  IndexUpgrader.java
  3.  *
  4.  *  Copyright (c) 2007-2014, The University of Sheffield.
  5.  *
  6.  *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
  7.  *  and is free software, licenced under the GNU Lesser General Public License,
  8.  *  Version 3, June 2007 (also included with this distribution as file
  9.  *  LICENCE-LGPL3.html).
  10.  *
  11.  *  Valentin Tablan, 27 Feb 2014
  12.  *
  13.  *  $Id: IndexUpgrader.java 20281 2017-12-05 00:35:07Z ian_roberts $
  14.  */
  15. package gate.mimir.util;

  16. import gate.Gate;
  17. import gate.mimir.IndexConfig;
  18. import gate.mimir.IndexConfig.SemanticIndexerConfig;
  19. import gate.mimir.MimirIndex;
  20. import gate.mimir.SemanticAnnotationHelper;
  21. import gate.mimir.index.AtomicIndex;
  22. import gate.mimir.index.DocumentCollection;
  23. import gate.mimir.index.IndexException;
  24. import it.unimi.di.big.mg4j.index.DiskBasedIndex;
  25. import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
  26. import it.unimi.di.big.mg4j.io.IOFactory;
  27. import it.unimi.di.big.mg4j.io.IOFactories;

  28. import it.unimi.dsi.util.Properties;


  29. import java.io.File;
  30. import java.io.FilenameFilter;
  31. import java.io.IOException;
  32. import java.nio.file.Files;
  33. import java.nio.file.StandardCopyOption;
  34. import java.util.Arrays;
  35. import java.util.HashSet;
  36. import java.util.Set;

  37. import org.slf4j.Logger;
  38. import org.slf4j.LoggerFactory;

  39. /**
  40.  * Implementation of an algorithm to upgrade a 4.x or 5.0 Mímir
  41.  * index to the format used by the current version.
  42.  */
  43. public class IndexUpgrader {
  44.  
  45.   protected static Logger logger = LoggerFactory.getLogger(IndexUpgrader.class);
  46.  
  47.   /**
  48.    * A minimal set of files required for a valid index.
  49.    */
  50.   protected static final String[] REQUIRED_INDEX_FILE_EXTENSIONS = new String[] {
  51.     DiskBasedIndex.INDEX_EXTENSION,
  52.     DiskBasedIndex.POSITIONS_EXTENSION,
  53.     DiskBasedIndex.TERMS_EXTENSION,
  54.     DiskBasedIndex.OFFSETS_EXTENSION
  55.   };

  56.   /**
  57.    * A minimal set of files required for a valid quasi-succinct index.
  58.    */
  59.   protected static final String[] REQUIRED_QS_INDEX_FILE_EXTENSIONS = new String[] {
  60.     DiskBasedIndex.COUNTS_EXTENSION,
  61.     DiskBasedIndex.COUNTS_EXTENSION + "offsets",
  62.     DiskBasedIndex.POINTERS_EXTENSIONS,
  63.     DiskBasedIndex.POINTERS_EXTENSIONS + "offsets",
  64.     DiskBasedIndex.POSITIONS_EXTENSION,
  65.     DiskBasedIndex.POSITIONS_EXTENSION + "offsets",
  66.     DiskBasedIndex.TERMS_EXTENSION,
  67.   };
  68.    
  69.   /**
  70.    * A minimal set of files required for a valid direct index.
  71.    */
  72.   protected static final String[] REQUIRED_DIRECT_INDEX_FILE_EXTENSIONS = new String[] {
  73.     DiskBasedIndex.INDEX_EXTENSION,
  74.     DiskBasedIndex.TERMS_EXTENSION,
  75.     DiskBasedIndex.OFFSETS_EXTENSION
  76.   };

  77.   /**
  78.    * A minimal set of files required for a valid quasi-succinct direct index.
  79.    */
  80.   protected static final String[] REQUIRED_QS_DIRECT_INDEX_FILE_EXTENSIONS = new String[] {
  81.     DiskBasedIndex.COUNTS_EXTENSION,
  82.     DiskBasedIndex.COUNTS_EXTENSION + "offsets",
  83.     DiskBasedIndex.POINTERS_EXTENSIONS,
  84.     DiskBasedIndex.POINTERS_EXTENSIONS + "offsets",
  85.     DiskBasedIndex.TERMS_EXTENSION,
  86.   };
  87.  
  88.   public static void upgradeIndex(File indexDirectory) throws IOException,
  89.       IndexException {
  90.     File indexConfigFile = new File(indexDirectory,
  91.         MimirIndex.INDEX_CONFIG_FILENAME);
  92.     IndexConfig indexConfig = IndexConfig.readConfigFromFile(indexConfigFile);
  93.     //test the version
  94.     if(indexConfig.getFormatVersion() == 7) {
  95.       upgradeFromV7(indexDirectory);
  96.     } else {
  97.       if(indexConfig.getFormatVersion() > 6 || indexConfig.getFormatVersion() < 4){
  98.         throw new IndexException(
  99.             "Unsupported index version: " + indexConfig.getFormatVersion());
  100.       }
  101.      
  102.       //check that none of the files to be created exist already
  103.       for(int i = 0 ; i < indexConfig.getTokenIndexers().length; i++) {
  104.         File tokenDir = new File(indexDirectory, "token-" + i);
  105.         if(tokenDir.exists()) {
  106.           throw new IndexException(
  107.               "Location required by upgraded index already exists:" +
  108.               tokenDir.getAbsolutePath());
  109.         }
  110.       }
  111.       for(int i = 0 ; i < indexConfig.getSemanticIndexers().length; i++) {
  112.         File tokenDir = new File(indexDirectory, "mention-" + i);
  113.         if(tokenDir.exists()) {
  114.           throw new IndexException(
  115.               "Location required by upgraded index already exists:" +
  116.               tokenDir.getAbsolutePath());
  117.         }
  118.       }
  119.      
  120.       // check access
  121.       File sourceDir = new File(indexDirectory, "mg4j");
  122.       if(!sourceDir.isDirectory()) throw new IndexException(
  123.           "Invalid index: could not find source directory at" +
  124.           sourceDir.getAbsolutePath());
  125.       if(!sourceDir.canRead()) throw new IndexException(
  126.           "Could not read source directory at" + sourceDir.getAbsolutePath());
  127.       // check that we know how to deal with the S-A-H implementations
  128.       Class<? extends SemanticAnnotationHelper> dbSahClass = null;
  129.       try {
  130.         dbSahClass = Class.forName(
  131.             "gate.mimir.db.DBSemanticAnnotationHelper",
  132.             true, Gate.getClassLoader()).asSubclass(
  133.                 SemanticAnnotationHelper.class);
  134.       } catch(ClassNotFoundException e) {
  135.         throw new IndexException("Could not find the DB S-A-H class. "
  136.             + "Is the 'db-h2' plugin loaded?", e);
  137.       }
  138.       for(int subIndexIdx = 0 ;
  139.           subIndexIdx < indexConfig.getSemanticIndexers().length;
  140.           subIndexIdx++) {
  141.         SemanticIndexerConfig sic = indexConfig.getSemanticIndexers()[subIndexIdx];
  142.         for(SemanticAnnotationHelper sah : sic.getHelpers()) {
  143.           while(sah instanceof DelegatingSemanticAnnotationHelper) {
  144.             sah = ((DelegatingSemanticAnnotationHelper)sah).getDelegate();
  145.           }
  146.           if(!dbSahClass.isAssignableFrom(sah.getClass())) {
  147.             throw new IndexException("Cannot convert mentions index mentions-" +
  148.                 subIndexIdx + " because it does not use the DB H2 " +
  149.                 "Annotation Helper, which is the only one supported by " +
  150.                 "this automatic upgrade process");
  151.           }
  152.         }
  153.       }
  154.       // move files
  155.       //collection files
  156.       File[] collectionFiles = sourceDir.listFiles(
  157.           DocumentCollection.CollectionFile.FILENAME_FILTER);
  158.       for(File aColFile : collectionFiles) {
  159.         File dest = new File(indexDirectory, aColFile.getName());
  160.         if(! aColFile.renameTo(dest)) {
  161.           throw new IndexException("Could not rename " +
  162.               aColFile.getAbsolutePath() + " to " + dest.getAbsolutePath());
  163.         }
  164.       }
  165.       //token indexes
  166.       for(int subIndexIdx = 0 ;
  167.           subIndexIdx < indexConfig.getTokenIndexers().length;
  168.           subIndexIdx++) {
  169.         upgradeSubIndex(indexDirectory, subIndexIdx,
  170.             indexConfig.getTokenIndexers()[subIndexIdx].isDirectIndexEnabled(),
  171.             null);
  172.       }
  173.       // mention indexes
  174.       for(int subIndexIdx = 0 ;
  175.           subIndexIdx < indexConfig.getSemanticIndexers().length;
  176.           subIndexIdx++) {
  177.         SemanticIndexerConfig sic = indexConfig.getSemanticIndexers()[subIndexIdx];
  178.         upgradeSubIndex(indexDirectory, subIndexIdx, sic.isDirectIndexEnabled(),
  179.             sic);
  180.       }
  181.       // cleanup old dirs (only if empty)
  182.       if(sourceDir.listFiles().length == 0) {
  183.         if(!sourceDir.delete()) {
  184.           logger.info("Could not delete old MG4J directory " + sourceDir +
  185.               " even though it appears empty.");
  186.         }
  187.       }
  188.       File sourceDBDir = new File(indexDirectory, "db");
  189.       if(sourceDBDir.listFiles().length == 0) {
  190.         if(!sourceDBDir.delete()) {
  191.           logger.info("Could not delete old DB directory " + sourceDBDir +
  192.               " even though it appears empty.");
  193.         }
  194.       }
  195.     }
  196.     //update the version number in the index config
  197.     indexConfig.setFormatVersion(IndexConfig.FORMAT_VERSION);
  198.     IndexConfig.writeConfigToFile(indexConfig, indexConfigFile);
  199.   }
  200.  
  201.   /**
  202.    * Moves the file belonging to one sub-index.
  203.    * @param indexDirectory the top level index directory for the M&iacute;mir
  204.    * index being upgraded.
  205.    * @param subIndexIdx the index (position) of the sub-index
  206.    * @param mentionsConfig if this is a mentions index, then this parameter
  207.    *  contains the mentions indexer config, null otherwise.
  208.    * @param direct doe this sub-index have a direct index also?
  209.    * @throws IndexException
  210.    * @throws IOException
  211.    */
  212.   protected static void upgradeSubIndex(File indexDirectory, int subIndexIdx,
  213.         final boolean direct, SemanticIndexerConfig mentionsConfig) throws IndexException, IOException {
  214.     File sourceDir = new File(indexDirectory, "mg4j");
  215.     // sanity checks
  216.     final String inputFilePrefix =
  217.         (mentionsConfig != null ? "mimir-mentions-" : "mimir-token-") +
  218.         subIndexIdx;
  219.    
  220.     File[] atomicIndexFiles = sourceDir.listFiles(new FilenameFilter() {
  221.       @Override
  222.       public boolean accept(File dir, String name) {
  223.         return name.startsWith(inputFilePrefix + ".") ||
  224.             (direct && name.startsWith(inputFilePrefix +
  225.              AtomicIndex.DIRECT_INDEX_NAME_SUFFIX + "."));
  226.       }
  227.     });

  228.     Set<String> requiredExtensions = new HashSet<String>();
  229.     Set<String> requiredDirectExtensions = new HashSet<String>();
  230.    
  231.     String indexPropertiesFile = new File(sourceDir, inputFilePrefix + DiskBasedIndex.PROPERTIES_EXTENSION).getAbsolutePath();
  232.     try {
  233.       Properties indexProperties = IOFactories.loadProperties(IOFactory.FILESYSTEM_FACTORY,
  234.           indexPropertiesFile);
  235.       if("it.unimi.di.big.mg4j.index.QuasiSuccinctIndex".equals(indexProperties.getString("indexclass"))) {
  236.         requiredExtensions.addAll(Arrays.asList(REQUIRED_QS_INDEX_FILE_EXTENSIONS));
  237.       } else {
  238.         requiredExtensions.addAll(Arrays.asList(REQUIRED_INDEX_FILE_EXTENSIONS));
  239.       }
  240.     } catch(Exception e) {
  241.       throw new IndexException("Error reading " + indexPropertiesFile, e);
  242.     }

  243.     if(direct) {
  244.       String directPropertiesFile = new File(sourceDir, inputFilePrefix + AtomicIndex.DIRECT_INDEX_NAME_SUFFIX
  245.           + DiskBasedIndex.PROPERTIES_EXTENSION).getAbsolutePath();
  246.       try {
  247.         Properties directProperties = IOFactories.loadProperties(IOFactory.FILESYSTEM_FACTORY,
  248.             directPropertiesFile);
  249.         if("it.unimi.di.big.mg4j.index.QuasiSuccinctIndex".equals(directProperties.getString("indexclass"))) {
  250.           requiredDirectExtensions.addAll(Arrays.asList(REQUIRED_QS_DIRECT_INDEX_FILE_EXTENSIONS));
  251.         } else {
  252.           requiredDirectExtensions.addAll(Arrays.asList(REQUIRED_DIRECT_INDEX_FILE_EXTENSIONS));
  253.         }
  254.       } catch(Exception e) {
  255.         throw new IndexException("Error reading " + directPropertiesFile, e);
  256.       }
  257.     }

  258.     for(File aFile : atomicIndexFiles) {
  259.       String extension = aFile.getName().substring(inputFilePrefix.length());
  260.       if(direct && extension.startsWith(AtomicIndex.DIRECT_INDEX_NAME_SUFFIX)) {
  261.         extension = extension.substring(AtomicIndex.DIRECT_INDEX_NAME_SUFFIX.length());
  262.         requiredDirectExtensions.remove(extension);
  263.       } else {
  264.         requiredExtensions.remove(extension);  
  265.       }
  266.     }
  267.     // check that we've seen all files we wanted
  268.     if(!requiredExtensions.isEmpty() ||
  269.         (direct && ! requiredDirectExtensions.isEmpty())) {
  270.       //not all required files were found
  271.       StringBuilder str = new StringBuilder(
  272.           "Some required files were not found for index '");
  273.       str.append(inputFilePrefix).append("': ");
  274.       for(String extension : requiredExtensions) {
  275.         str.append(new File(sourceDir,
  276.             inputFilePrefix + extension).getAbsolutePath());
  277.         str.append("\n");
  278.       }
  279.       if(direct) {
  280.         for(String extension : requiredDirectExtensions) {
  281.           str.append(new File(sourceDir,
  282.               inputFilePrefix + extension).getAbsolutePath());
  283.           str.append("\n");
  284.         }
  285.       }
  286.       throw new IndexException(str.toString());
  287.     }
  288.    
  289.     // all tests passed - start creating the new directories
  290.     String outputFilePrefix = (mentionsConfig != null ? "mention-" : "token-") +
  291.         subIndexIdx;
  292.     File atomicIndexDir = new File(indexDirectory, outputFilePrefix);
  293.     File headDir = new File(atomicIndexDir, AtomicIndex.HEAD_FILE_NAME);
  294.     if(!headDir.mkdirs()) {
  295.       throw new IndexException(
  296.           "Location required by upgraded index could not be created:" +
  297.           headDir.getAbsolutePath());
  298.     }
  299.     for(File sourceFile : atomicIndexFiles) {
  300.       String extension = sourceFile.getName().substring(inputFilePrefix.length());
  301.       File destinationFile = new File(headDir, outputFilePrefix + extension);
  302.       if(!sourceFile.renameTo(destinationFile)) {
  303.         throw new IndexException("Could not rename " +
  304.             sourceFile.getAbsolutePath() + " to " +
  305.             destinationFile.getAbsolutePath());
  306.       }
  307.     }
  308.     // create Bloom filter, and regenerate the term map
  309.     File termsFile = new File(headDir, outputFilePrefix +
  310.         DiskBasedIndex.TERMS_EXTENSION); // guaranteed to exist, as tested already
  311.     File termMapFile = new File(headDir, outputFilePrefix +
  312.         DiskBasedIndex.TERMMAP_EXTENSION); // may not exist but that's OK
  313.     File bloomFile = new File(headDir, outputFilePrefix +
  314.         DocumentalCluster.BLOOM_EXTENSION); // guaranteed to exist, as tested already
  315.     if(termMapFile.exists()) {
  316.       if(!termMapFile.renameTo(new File(headDir, outputFilePrefix +
  317.           DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
  318.         logger.warn("Unable to back up old termmap for " + outputFilePrefix + "/" + headDir.getName());
  319.       }
  320.     }
  321.     AtomicIndex.generateTermMap(termsFile, termMapFile, bloomFile);
  322.    
  323.     if(direct) {
  324.       // create the direct.terms file by copying the terms file from
  325.       // the **inverted** index in head
  326.       File dest = new File(atomicIndexDir, AtomicIndex.DIRECT_TERMS_FILENAME);
  327.       Files.copy(termsFile.toPath(), dest.toPath(),
  328.           StandardCopyOption.COPY_ATTRIBUTES);
  329.       // create direct Bloom filter and regenerate term map
  330.       File dirTermsFile = new File(headDir, outputFilePrefix +
  331.           AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  332.           DiskBasedIndex.TERMS_EXTENSION); // guaranteed to exist, as tested already
  333.       File dirTermMapFile = new File(headDir, outputFilePrefix +
  334.           AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  335.           DiskBasedIndex.TERMMAP_EXTENSION); // may not exist but that's OK
  336.       File dirBloomFile = new File(headDir, outputFilePrefix +
  337.           AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  338.           DocumentalCluster.BLOOM_EXTENSION); // guaranteed to exist, as tested already
  339.       if(dirTermMapFile.exists()) {
  340.         if(!dirTermMapFile.renameTo(new File(headDir, outputFilePrefix +
  341.             AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  342.             DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
  343.           logger.warn("Unable to back up old direct termmap for " + outputFilePrefix + "/" + headDir.getName());
  344.         }
  345.       }
  346.       AtomicIndex.generateTermMap(dirTermsFile, dirTermMapFile, dirBloomFile);
  347.     }
  348.    
  349.     // move the DB files
  350.     if(mentionsConfig != null) {
  351.       // We know that the DB-H2 S-A-H was used, as we've already tested for that
  352.       File sourceDBDir = new File(indexDirectory, "db");
  353.       File destDBDir = new File(atomicIndexDir, "db");
  354.       if(!destDBDir.mkdirs()) {
  355.         throw new IndexException(
  356.             "Location required by upgraded index could not be created:" +
  357.             destDBDir.getAbsolutePath());
  358.       }
  359.       for(String annType : mentionsConfig.getAnnotationTypes()) {
  360.         String tableBaseName = annType.replaceAll("[^\\p{Alnum}_]", "_");
  361.         File source = new File(sourceDBDir, tableBaseName + ".h2.db");
  362.         File dest = new File(destDBDir, tableBaseName + ".h2.db");
  363.         if(!source.renameTo(dest)) {
  364.           throw new IndexException("Could not rename " +  
  365.               source.getAbsolutePath() + " to " + dest.getAbsolutePath());
  366.         }
  367.       }
  368.     }
  369.   }
  370.  
  371.   protected static void upgradeFromV7(File indexDirectory) throws IndexException, IOException {
  372.     // just need to regenerate the term maps
  373.     for(File subIndexDirectory : indexDirectory.listFiles((File parent, String name) -> name.matches("(?:token|mention)-\\d+"))) {
  374.       String outputFilePrefix = subIndexDirectory.getName();
  375.       for(File batchDir : subIndexDirectory.listFiles((File parent, String name) -> name.matches("head|tail-\\d+"))) {
  376.         // create termmap and Bloom filter
  377.         File termsFile = new File(batchDir, outputFilePrefix +
  378.             DiskBasedIndex.TERMS_EXTENSION);
  379.         File termMapFile = new File(batchDir, outputFilePrefix +
  380.             DiskBasedIndex.TERMMAP_EXTENSION);
  381.         File bloomFile = new File(batchDir, outputFilePrefix +
  382.             DocumentalCluster.BLOOM_EXTENSION);
  383.         if(!termsFile.exists()) {
  384.           throw new IndexException("No terms file found for " + outputFilePrefix + "/" + batchDir.getName());
  385.         }
  386.         if(termMapFile.exists()) {
  387.           if(!termMapFile.renameTo(new File(batchDir, outputFilePrefix +
  388.               DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
  389.             logger.warn("Unable to back up old termmap for " + outputFilePrefix + "/" + batchDir.getName());
  390.           }
  391.         }
  392.         if(bloomFile.exists()) {
  393.           if(!bloomFile.renameTo(new File(batchDir, outputFilePrefix +
  394.               DocumentalCluster.BLOOM_EXTENSION + ".old"))) {
  395.             logger.warn("Unable to back up old bloom filter for " + outputFilePrefix + "/" + batchDir.getName());
  396.           }
  397.         }
  398.         AtomicIndex.generateTermMap(termsFile, termMapFile, bloomFile);
  399.        
  400.         // create direct termmap and Bloom filter
  401.         File dirTermsFile = new File(batchDir, outputFilePrefix +
  402.             AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  403.             DiskBasedIndex.TERMS_EXTENSION);
  404.         if(dirTermsFile.exists()) {
  405.           File dirTermMapFile = new File(batchDir, outputFilePrefix +
  406.               AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  407.               DiskBasedIndex.TERMMAP_EXTENSION);
  408.           File dirBloomFile = new File(batchDir, outputFilePrefix +
  409.               AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  410.               DocumentalCluster.BLOOM_EXTENSION);
  411.           if(dirTermMapFile.exists()) {
  412.             if(!dirTermMapFile.renameTo(new File(batchDir, outputFilePrefix +
  413.                 AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  414.                 DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
  415.               logger.warn("Unable to back up old direct termmap for " + outputFilePrefix + "/" + batchDir.getName());
  416.             }
  417.           }
  418.           if(dirBloomFile.exists()) {
  419.             if(!dirBloomFile.renameTo(new File(batchDir, outputFilePrefix +
  420.                 AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
  421.                 DocumentalCluster.BLOOM_EXTENSION + ".old"))) {
  422.               logger.warn("Unable to back up old direct bloom filter for " + outputFilePrefix + "/" + batchDir.getName());
  423.             }
  424.           }

  425.           AtomicIndex.generateTermMap(dirTermsFile, dirTermMapFile, dirBloomFile);
  426.         }

  427.       }
  428.     }
  429.   }

  430. }