/*
* IndexUpgrader.java
*
* Copyright (c) 2007-2014, The University of Sheffield.
*
* This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
* and is free software, licenced under the GNU Lesser General Public License,
* Version 3, June 2007 (also included with this distribution as file
* LICENCE-LGPL3.html).
*
* Valentin Tablan, 27 Feb 2014
*
* $Id: IndexUpgrader.java 20281 2017-12-05 00:35:07Z ian_roberts $
*/
package gate.mimir.util;
import gate.Gate;
import gate.mimir.IndexConfig;
import gate.mimir.IndexConfig.SemanticIndexerConfig;
import gate.mimir.MimirIndex;
import gate.mimir.SemanticAnnotationHelper;
import gate.mimir.index.AtomicIndex;
import gate.mimir.index.DocumentCollection;
import gate.mimir.index.IndexException;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.di.big.mg4j.io.IOFactories;
import it.unimi.dsi.util.Properties;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Implementation of an algorithm to upgrade a 4.x or 5.0 Mímir
* index to the format used by the current version.
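*
* <p>A minimal usage sketch (the index path below is only an example). The
* upgrade runs in place on the index directory, and if the index contains
* semantic annotation indexers the 'db-h2' plugin must already be loaded:</p>
* <pre>
* File indexDir = new File("/data/mimir/indexes/index-1");
* IndexUpgrader.upgradeIndex(indexDir);
* </pre>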
*/
public class IndexUpgrader {
protected static Logger logger = LoggerFactory.getLogger(IndexUpgrader.class);
/**
* A minimal set of files required for a valid index.
*/
protected static final String[] REQUIRED_INDEX_FILE_EXTENSIONS = new String[] {
DiskBasedIndex.INDEX_EXTENSION,
DiskBasedIndex.POSITIONS_EXTENSION,
DiskBasedIndex.TERMS_EXTENSION,
DiskBasedIndex.OFFSETS_EXTENSION
};
/**
* A minimal set of files required for a valid quasi-succinct index.
*/
protected static final String[] REQUIRED_QS_INDEX_FILE_EXTENSIONS = new String[] {
DiskBasedIndex.COUNTS_EXTENSION,
DiskBasedIndex.COUNTS_EXTENSION + "offsets",
DiskBasedIndex.POINTERS_EXTENSIONS,
DiskBasedIndex.POINTERS_EXTENSIONS + "offsets",
DiskBasedIndex.POSITIONS_EXTENSION,
DiskBasedIndex.POSITIONS_EXTENSION + "offsets",
DiskBasedIndex.TERMS_EXTENSION,
};
/**
* A minimal set of files required for a valid direct index.
*/
protected static final String[] REQUIRED_DIRECT_INDEX_FILE_EXTENSIONS = new String[] {
DiskBasedIndex.INDEX_EXTENSION,
DiskBasedIndex.TERMS_EXTENSION,
DiskBasedIndex.OFFSETS_EXTENSION
};
/**
* A minimal set of files required for a valid quasi-succinct direct index.
*/
protected static final String[] REQUIRED_QS_DIRECT_INDEX_FILE_EXTENSIONS = new String[] {
DiskBasedIndex.COUNTS_EXTENSION,
DiskBasedIndex.COUNTS_EXTENSION + "offsets",
DiskBasedIndex.POINTERS_EXTENSIONS,
DiskBasedIndex.POINTERS_EXTENSIONS + "offsets",
DiskBasedIndex.TERMS_EXTENSION,
};
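/**
* Upgrades the Mímir index found at the given location, in place. Format
* version 7 indexes only have their term maps and Bloom filters regenerated;
* older (version 4 to 6) indexes also have their files moved into the new
* directory layout.
* @param indexDirectory the top level directory of the index to upgrade
* @throws IOException if index files cannot be read, moved, or written
* @throws IndexException if the index is invalid, of an unsupported version,
* or uses annotation helpers this upgrader cannot convert
*/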
public static void upgradeIndex(File indexDirectory) throws IOException,
IndexException {
File indexConfigFile = new File(indexDirectory,
MimirIndex.INDEX_CONFIG_FILENAME);
IndexConfig indexConfig = IndexConfig.readConfigFromFile(indexConfigFile);
//test the version
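// format 7 only needs its term maps and Bloom filters regenerated;
// formats 4 to 6 go through the full directory layout migration below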
if(indexConfig.getFormatVersion() == 7) {
upgradeFromV7(indexDirectory);
} else {
if(indexConfig.getFormatVersion() > 6 || indexConfig.getFormatVersion() < 4){
throw new IndexException(
"Unsupported index version: " + indexConfig.getFormatVersion());
}
//check that none of the files to be created exist already
for(int i = 0 ; i < indexConfig.getTokenIndexers().length; i++) {
File tokenDir = new File(indexDirectory, "token-" + i);
if(tokenDir.exists()) {
throw new IndexException(
"Location required by upgraded index already exists: " +
tokenDir.getAbsolutePath());
}
}
for(int i = 0 ; i < indexConfig.getSemanticIndexers().length; i++) {
File tokenDir = new File(indexDirectory, "mention-" + i);
if(tokenDir.exists()) {
throw new IndexException(
"Location required by upgraded index already exists: " +
tokenDir.getAbsolutePath());
}
}
// check access
File sourceDir = new File(indexDirectory, "mg4j");
if(!sourceDir.isDirectory()) throw new IndexException(
"Invalid index: could not find source directory at " +
sourceDir.getAbsolutePath());
if(!sourceDir.canRead()) throw new IndexException(
"Could not read source directory at " + sourceDir.getAbsolutePath());
// check that we know how to deal with the S-A-H implementations
Class<? extends SemanticAnnotationHelper> dbSahClass = null;
try {
dbSahClass = Class.forName(
"gate.mimir.db.DBSemanticAnnotationHelper",
true, Gate.getClassLoader()).asSubclass(
SemanticAnnotationHelper.class);
} catch(ClassNotFoundException e) {
throw new IndexException("Could not find the DB S-A-H class. "
+ "Is the 'db-h2' plugin loaded?", e);
}
for(int subIndexIdx = 0 ;
subIndexIdx < indexConfig.getSemanticIndexers().length;
subIndexIdx++) {
SemanticIndexerConfig sic = indexConfig.getSemanticIndexers()[subIndexIdx];
for(SemanticAnnotationHelper sah : sic.getHelpers()) {
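// unwrap any delegating helpers to find the actual implementation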
while(sah instanceof DelegatingSemanticAnnotationHelper) {
sah = ((DelegatingSemanticAnnotationHelper)sah).getDelegate();
}
if(!dbSahClass.isAssignableFrom(sah.getClass())) {
throw new IndexException("Cannot convert mentions index mentions-" +
subIndexIdx + " because it does not use the DB H2 " +
"Annotation Helper, which is the only one supported by " +
"this automatic upgrade process");
}
}
}
// move files
//collection files
File[] collectionFiles = sourceDir.listFiles(
DocumentCollection.CollectionFile.FILENAME_FILTER);
for(File aColFile : collectionFiles) {
File dest = new File(indexDirectory, aColFile.getName());
if(! aColFile.renameTo(dest)) {
throw new IndexException("Could not rename " +
aColFile.getAbsolutePath() + " to " + dest.getAbsolutePath());
}
}
//token indexes
for(int subIndexIdx = 0 ;
subIndexIdx < indexConfig.getTokenIndexers().length;
subIndexIdx++) {
upgradeSubIndex(indexDirectory, subIndexIdx,
indexConfig.getTokenIndexers()[subIndexIdx].isDirectIndexEnabled(),
null);
}
// mention indexes
for(int subIndexIdx = 0 ;
subIndexIdx < indexConfig.getSemanticIndexers().length;
subIndexIdx++) {
SemanticIndexerConfig sic = indexConfig.getSemanticIndexers()[subIndexIdx];
upgradeSubIndex(indexDirectory, subIndexIdx, sic.isDirectIndexEnabled(),
sic);
}
// cleanup old dirs (only if empty)
if(sourceDir.listFiles().length == 0) {
if(!sourceDir.delete()) {
logger.info("Could not delete old MG4J directory " + sourceDir +
" even though it appears empty.");
}
}
File sourceDBDir = new File(indexDirectory, "db");
if(sourceDBDir.isDirectory() && sourceDBDir.listFiles().length == 0) {
if(!sourceDBDir.delete()) {
logger.info("Could not delete old DB directory " + sourceDBDir +
" even though it appears empty.");
}
}
}
//update the version number in the index config
indexConfig.setFormatVersion(IndexConfig.FORMAT_VERSION);
IndexConfig.writeConfigToFile(indexConfig, indexConfigFile);
}
/**
* Moves the files belonging to one sub-index.
* @param indexDirectory the top level index directory for the Mímir
* index being upgraded.
* @param subIndexIdx the index (position) of the sub-index
* @param direct does this sub-index also have a direct index?
* @param mentionsConfig if this is a mentions index, this parameter
* contains the semantic indexer config; null otherwise.
* @throws IndexException
* @throws IOException
*/
protected static void upgradeSubIndex(File indexDirectory, int subIndexIdx,
final boolean direct, SemanticIndexerConfig mentionsConfig) throws IndexException, IOException {
File sourceDir = new File(indexDirectory, "mg4j");
// sanity checks
final String inputFilePrefix =
(mentionsConfig != null ? "mimir-mentions-" : "mimir-token-") +
subIndexIdx;
File[] atomicIndexFiles = sourceDir.listFiles(new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.startsWith(inputFilePrefix + ".") ||
(direct && name.startsWith(inputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX + "."));
}
});
Set<String> requiredExtensions = new HashSet<String>();
Set<String> requiredDirectExtensions = new HashSet<String>();
String indexPropertiesFile = new File(sourceDir, inputFilePrefix + DiskBasedIndex.PROPERTIES_EXTENSION).getAbsolutePath();
try {
Properties indexProperties = IOFactories.loadProperties(IOFactory.FILESYSTEM_FACTORY,
indexPropertiesFile);
if("it.unimi.di.big.mg4j.index.QuasiSuccinctIndex".equals(indexProperties.getString("indexclass"))) {
requiredExtensions.addAll(Arrays.asList(REQUIRED_QS_INDEX_FILE_EXTENSIONS));
} else {
requiredExtensions.addAll(Arrays.asList(REQUIRED_INDEX_FILE_EXTENSIONS));
}
} catch(Exception e) {
throw new IndexException("Error reading " + indexPropertiesFile, e);
}
if(direct) {
String directPropertiesFile = new File(sourceDir, inputFilePrefix + AtomicIndex.DIRECT_INDEX_NAME_SUFFIX
+ DiskBasedIndex.PROPERTIES_EXTENSION).getAbsolutePath();
try {
Properties directProperties = IOFactories.loadProperties(IOFactory.FILESYSTEM_FACTORY,
directPropertiesFile);
if("it.unimi.di.big.mg4j.index.QuasiSuccinctIndex".equals(directProperties.getString("indexclass"))) {
requiredDirectExtensions.addAll(Arrays.asList(REQUIRED_QS_DIRECT_INDEX_FILE_EXTENSIONS));
} else {
requiredDirectExtensions.addAll(Arrays.asList(REQUIRED_DIRECT_INDEX_FILE_EXTENSIONS));
}
} catch(Exception e) {
throw new IndexException("Error reading " + directPropertiesFile, e);
}
}
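// tick off each required extension as the corresponding file is found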
for(File aFile : atomicIndexFiles) {
String extension = aFile.getName().substring(inputFilePrefix.length());
if(direct && extension.startsWith(AtomicIndex.DIRECT_INDEX_NAME_SUFFIX)) {
extension = extension.substring(AtomicIndex.DIRECT_INDEX_NAME_SUFFIX.length());
requiredDirectExtensions.remove(extension);
} else {
requiredExtensions.remove(extension);
}
}
// check that we've seen all files we wanted
if(!requiredExtensions.isEmpty() ||
(direct && ! requiredDirectExtensions.isEmpty())) {
//not all required files were found
StringBuilder str = new StringBuilder(
"Some required files were not found for index '");
str.append(inputFilePrefix).append("': ");
for(String extension : requiredExtensions) {
str.append(new File(sourceDir,
inputFilePrefix + extension).getAbsolutePath());
str.append("\n");
}
if(direct) {
for(String extension : requiredDirectExtensions) {
str.append(new File(sourceDir, inputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX + extension).getAbsolutePath());
str.append("\n");
}
}
throw new IndexException(str.toString());
}
// all tests passed - start creating the new directories
String outputFilePrefix = (mentionsConfig != null ? "mention-" : "token-") +
subIndexIdx;
File atomicIndexDir = new File(indexDirectory, outputFilePrefix);
File headDir = new File(atomicIndexDir, AtomicIndex.HEAD_FILE_NAME);
if(!headDir.mkdirs()) {
throw new IndexException(
"Location required by upgraded index could not be created: " +
headDir.getAbsolutePath());
}
for(File sourceFile : atomicIndexFiles) {
String extension = sourceFile.getName().substring(inputFilePrefix.length());
File destinationFile = new File(headDir, outputFilePrefix + extension);
if(!sourceFile.renameTo(destinationFile)) {
throw new IndexException("Could not rename " +
sourceFile.getAbsolutePath() + " to " +
destinationFile.getAbsolutePath());
}
}
// create Bloom filter, and regenerate the term map
File termsFile = new File(headDir, outputFilePrefix +
DiskBasedIndex.TERMS_EXTENSION); // guaranteed to exist, as tested already
File termMapFile = new File(headDir, outputFilePrefix +
DiskBasedIndex.TERMMAP_EXTENSION); // may not exist but that's OK
File bloomFile = new File(headDir, outputFilePrefix +
DocumentalCluster.BLOOM_EXTENSION); // generated below by generateTermMap
if(termMapFile.exists()) {
if(!termMapFile.renameTo(new File(headDir, outputFilePrefix +
DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
logger.warn("Unable to back up old termmap for " + outputFilePrefix + "/" + headDir.getName());
}
}
AtomicIndex.generateTermMap(termsFile, termMapFile, bloomFile);
if(direct) {
// create the direct.terms file by copying the terms file from
// the **inverted** index in head
File dest = new File(atomicIndexDir, AtomicIndex.DIRECT_TERMS_FILENAME);
Files.copy(termsFile.toPath(), dest.toPath(),
StandardCopyOption.COPY_ATTRIBUTES);
// create direct Bloom filter and regenerate term map
File dirTermsFile = new File(headDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DiskBasedIndex.TERMS_EXTENSION); // guaranteed to exist, as tested already
File dirTermMapFile = new File(headDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DiskBasedIndex.TERMMAP_EXTENSION); // may not exist but that's OK
File dirBloomFile = new File(headDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DocumentalCluster.BLOOM_EXTENSION); // generated below by generateTermMap
if(dirTermMapFile.exists()) {
if(!dirTermMapFile.renameTo(new File(headDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
logger.warn("Unable to back up old direct termmap for " + outputFilePrefix + "/" + headDir.getName());
}
}
AtomicIndex.generateTermMap(dirTermsFile, dirTermMapFile, dirBloomFile);
}
// move the DB files
if(mentionsConfig != null) {
// We know that the DB-H2 S-A-H was used, as we've already tested for that
File sourceDBDir = new File(indexDirectory, "db");
File destDBDir = new File(atomicIndexDir, "db");
if(!destDBDir.mkdirs()) {
throw new IndexException(
"Location required by upgraded index could not be created: " +
destDBDir.getAbsolutePath());
}
for(String annType : mentionsConfig.getAnnotationTypes()) {
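// the DB helper names its H2 database file after the annotation type,
// with every character that is not alphanumeric or '_' replaced by '_'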
String tableBaseName = annType.replaceAll("[^\\p{Alnum}_]", "_");
File source = new File(sourceDBDir, tableBaseName + ".h2.db");
File dest = new File(destDBDir, tableBaseName + ".h2.db");
if(!source.renameTo(dest)) {
throw new IndexException("Could not rename " +
source.getAbsolutePath() + " to " + dest.getAbsolutePath());
}
}
}
}
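/**
* Upgrades a format version 7 index: the directory layout is already
* current, so only the term maps and Bloom filters of each head and tail
* batch are regenerated (the old files are kept with a ".old" suffix).
* @param indexDirectory the top level directory of the index to upgrade
* @throws IndexException if a required terms file is missing
* @throws IOException if files cannot be read or written
*/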
protected static void upgradeFromV7(File indexDirectory) throws IndexException, IOException {
// just need to regenerate the term maps
for(File subIndexDirectory : indexDirectory.listFiles((File parent, String name) -> name.matches("(?:token|mention)-\\d+"))) {
String outputFilePrefix = subIndexDirectory.getName();
for(File batchDir : subIndexDirectory.listFiles((File parent, String name) -> name.matches("head|tail-\\d+"))) {
// create termmap and Bloom filter
File termsFile = new File(batchDir, outputFilePrefix +
DiskBasedIndex.TERMS_EXTENSION);
File termMapFile = new File(batchDir, outputFilePrefix +
DiskBasedIndex.TERMMAP_EXTENSION);
File bloomFile = new File(batchDir, outputFilePrefix +
DocumentalCluster.BLOOM_EXTENSION);
if(!termsFile.exists()) {
throw new IndexException("No terms file found for " + outputFilePrefix + "/" + batchDir.getName());
}
if(termMapFile.exists()) {
if(!termMapFile.renameTo(new File(batchDir, outputFilePrefix +
DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
logger.warn("Unable to back up old termmap for " + outputFilePrefix + "/" + batchDir.getName());
}
}
if(bloomFile.exists()) {
if(!bloomFile.renameTo(new File(batchDir, outputFilePrefix +
DocumentalCluster.BLOOM_EXTENSION + ".old"))) {
logger.warn("Unable to back up old bloom filter for " + outputFilePrefix + "/" + batchDir.getName());
}
}
AtomicIndex.generateTermMap(termsFile, termMapFile, bloomFile);
// create direct termmap and Bloom filter
File dirTermsFile = new File(batchDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DiskBasedIndex.TERMS_EXTENSION);
if(dirTermsFile.exists()) {
File dirTermMapFile = new File(batchDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DiskBasedIndex.TERMMAP_EXTENSION);
File dirBloomFile = new File(batchDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DocumentalCluster.BLOOM_EXTENSION);
if(dirTermMapFile.exists()) {
if(!dirTermMapFile.renameTo(new File(batchDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DiskBasedIndex.TERMMAP_EXTENSION + ".old"))) {
logger.warn("Unable to back up old direct termmap for " + outputFilePrefix + "/" + batchDir.getName());
}
}
if(dirBloomFile.exists()) {
if(!dirBloomFile.renameTo(new File(batchDir, outputFilePrefix +
AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
DocumentalCluster.BLOOM_EXTENSION + ".old"))) {
logger.warn("Unable to back up old direct bloom filter for " + outputFilePrefix + "/" + batchDir.getName());
}
}
AtomicIndex.generateTermMap(dirTermsFile, dirTermMapFile, dirBloomFile);
}
}
}
}
}