@CreoleResource(name="GATE HTML Document Format", isPrivate=true, autoinstances=) public class NekoHtmlDocumentFormat extends TextualDocumentFormat
DocumentFormat that uses Andy Clark's NekoHTML parser to parse HTML documents. It tries to render HTML in a similar way to a web browser, i.e. whitespace is normalized, paragraphs are separated by a blank line, etc. By default the text content of style and script tags is ignored completely, though the set of tags treated in this way is configurable via a CREOLE parameter.
element2StringMap, magic2mimeTypeMap, markupElementsMap, mimeString2ClassHandlerMap, mimeString2mimeTypeMap, suffixes2mimeTypeMap
dataStore, lrPersistentId
name
Constructor and Description |
---|
NekoHtmlDocumentFormat()
Default construction
|
Modifier and Type | Method and Description |
---|---|
Set<String> |
getIgnorableTags() |
Resource |
init()
Initialise this resource, and return it.
|
void |
setIgnorableTags(Set<String> newTags) |
Boolean |
supportsRepositioning()
We support repositioning info for HTML files.
|
void |
unpackMarkup(Document doc)
Old-style unpackMarkup, without repositioning info.
|
void |
unpackMarkup(Document doc,
RepositioningInfo repInfo,
RepositioningInfo ampCodingInfo)
Unpack the markup in the document.
|
annotateParagraphs, getDataStore, hasContentButNoValidUrl, setNewLineProperty
addStatusListener, areEqual, decideBetweenThreeMimeTypes, decideBetweenTwoMimeTypes, fireStatusChanged, getDocumentFormat, getDocumentFormat, getDocumentFormat, getDocumentFormat, getElement2StringMap, getFeatures, getMarkupElementsMap, getMimeType, getMimeTypeForString, getShouldCollectRepositioning, getSupportedFileSuffixes, getSupportedMimeTypes, guessTypeUsingMagicNumbers, removeStatusListener, runMagicNumbers, setElement2StringMap, setFeatures, setMarkupElementsMap, setMimeType, setShouldCollectRepositioning, unpackMarkup, willReadFromUrl
cleanup, getLRPersistenceId, getParent, isModified, setDataStore, setLRPersistenceId, setParent, sync
checkParameterValues, flushBeanInfoCache, forgetBeanInfo, getBeanInfo, getInitParameterValues, getInitParameterValues, getName, getParameterValue, getParameterValue, getParameterValues, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners, toString
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
getParameterValue, setParameterValue, setParameterValues
getName, setName
@CreoleParameter(comment="HTML tags whose text content should be ignored", defaultValue="script;style;iframe") public void setIgnorableTags(Set<String> newTags)
public Boolean supportsRepositioning()
supportsRepositioning
in class DocumentFormat
public void unpackMarkup(Document doc) throws DocumentFormatException
unpackMarkup
in class TextualDocumentFormat
DocumentFormatException
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException
unpackMarkup
in class TextualDocumentFormat
doc
- The gate document you want to parse. If
doc.getSourceUrl()
returns null
then the content of doc will be parsed. Using a URL is
recommended because the parser will report errors correctly
if the document is not well formed.
DocumentFormatException
public Resource init() throws ResourceInstantiationException
init
in interface Resource
init
in class TextualDocumentFormat
ResourceInstantiationException
Copyright © 2024 GATE. All rights reserved.