public class NekoHtmlDocumentHandler extends Object implements org.apache.xerces.xni.XMLDocumentHandler, org.apache.xerces.xni.parser.XMLErrorHandler
Modifier and Type | Field and Description |
---|---|
protected boolean |
addSpaceOnUnpack
Initialised from the user config, stores whether to add extra space
characters to separate words that would otherwise be run together.
|
static String |
AUGMENTATIONS |
protected int |
customObjectsId |
protected List<StatusListener> |
myStatusListeners |
protected boolean |
previousChunkEndedWithWS
During parsing, keeps track of whether the previous chunk of
character data ended with a whitespace character.
|
Constructor and Description |
---|
NekoHtmlDocumentHandler(Document aDocument,
AnnotationSet anAnnotationSet,
Set<String> ignorableTags)
Constructor initialises all the private member data
|
Modifier and Type | Method and Description |
---|---|
void |
addRepositioningInfo(int contentLength,
int pos,
int extractedPos)
For given content the list with shrink position information is
searched and on the corresponding positions the correct
repositioning information is calculated and generated.
|
void |
addStatusListener(StatusListener listener) |
void |
characters(org.apache.xerces.xni.XMLString text,
org.apache.xerces.xni.Augmentations augs)
Called when the parser encounters character or CDATA content.
|
void |
charactersAction()
Called when all text between two tags has been processed.
|
void |
comment(org.apache.xerces.xni.XMLString content,
org.apache.xerces.xni.Augmentations augs) |
protected void |
customizeAppearanceOfDocumentWithEndTag(String tagName)
This method analyses the tag tagName and adds some \n chars and spaces to
the tmpDocContent. The reason behind this is that we need to have a
readable form for the final document.
|
protected void |
customizeAppearanceOfDocumentWithStartTag(String tagName)
This method analyses the tag tagName and adds some \n chars and spaces to
the tmpDocContent. The reason behind this is that we need to have a
readable form for the final document.
|
void |
doctypeDecl(String arg0,
String arg1,
String arg2,
org.apache.xerces.xni.Augmentations arg3) |
void |
emptyElement(org.apache.xerces.xni.QName element,
org.apache.xerces.xni.XMLAttributes attributes,
org.apache.xerces.xni.Augmentations augs)
Called to signal an empty element.
|
void |
endCDATA(org.apache.xerces.xni.Augmentations augs) |
void |
endDocument(org.apache.xerces.xni.Augmentations augs)
Called when the parser reaches the end of the document.
|
void |
endElement(org.apache.xerces.xni.QName element,
org.apache.xerces.xni.Augmentations augs)
Called when the parser encounters the end of an element.
|
void |
endElement(org.apache.xerces.xni.QName element,
org.apache.xerces.xni.Augmentations augs,
boolean wasEmptyElement)
Called when the parser encounters the end of an HTML element.
|
void |
endGeneralEntity(String arg0,
org.apache.xerces.xni.Augmentations arg1) |
void |
error(String domain,
String key,
org.apache.xerces.xni.parser.XMLParseException e)
Non-fatal error, print the stack trace but continue processing.
|
void |
fatalError(String domain,
String key,
org.apache.xerces.xni.parser.XMLParseException e) |
protected void |
fireStatusChangedEvent(String text) |
RepositioningInfo |
getAmpCodingInfo()
Return current RepositioningInfo object for ampersand coding.
|
int |
getCustomObjectsId() |
org.apache.xerces.xni.parser.XMLDocumentSource |
getDocumentSource() |
Set<String> |
getIgnorableTags()
Get the set of tag names whose content is ignored by this handler.
|
RepositioningInfo |
getRepositioningInfo()
Return current RepositioningInfo object
|
void |
ignorableWhitespace(org.apache.xerces.xni.XMLString arg0,
org.apache.xerces.xni.Augmentations arg1) |
void |
processingInstruction(String target,
org.apache.xerces.xni.XMLString data,
org.apache.xerces.xni.Augmentations augs) |
void |
removeStatusListener(StatusListener listener) |
void |
setAmpCodingInfo(RepositioningInfo info)
Set repositioning information structure reference for ampersand
coding.
|
void |
setDocumentSource(org.apache.xerces.xni.parser.XMLDocumentSource arg0) |
void |
setIgnorableTags(Set<String> newTags)
Set the set of tag names whose text content will be ignored.
|
void |
setLineOffsets(int[] lineOffsets)
Set the array of line offsets.
|
void |
setRepositioningInfo(RepositioningInfo info)
Set repositioning information structure reference.
|
void |
startCDATA(org.apache.xerces.xni.Augmentations augs) |
void |
startDocument(org.apache.xerces.xni.XMLLocator arg0,
String arg1,
org.apache.xerces.xni.NamespaceContext arg2,
org.apache.xerces.xni.Augmentations arg3) |
void |
startElement(org.apache.xerces.xni.QName element,
org.apache.xerces.xni.XMLAttributes attributes,
org.apache.xerces.xni.Augmentations augs)
Called when the parser encounters the start of an HTML element.
|
void |
startGeneralEntity(String arg0,
org.apache.xerces.xni.XMLResourceIdentifier arg1,
String arg2,
org.apache.xerces.xni.Augmentations arg3) |
void |
textDecl(String arg0,
String arg1,
org.apache.xerces.xni.Augmentations arg2) |
void |
warning(String arg0,
String arg1,
org.apache.xerces.xni.parser.XMLParseException arg2) |
void |
xmlDecl(String arg0,
String arg1,
String arg2,
org.apache.xerces.xni.Augmentations arg3) |
public static final String AUGMENTATIONS
protected List<StatusListener> myStatusListeners
protected int customObjectsId
protected boolean addSpaceOnUnpack
protected boolean previousChunkEndedWithWS
public NekoHtmlDocumentHandler(Document aDocument, AnnotationSet anAnnotationSet, Set<String> ignorableTags)
aDocument
- The GATE document that will be processed
anAnnotationSet
- The annotation set that will contain
annotations resulting from the processing of the GATE
document
ignorableTags
- HTML tag names (lower case) whose text content
should be ignored by this handler.
public void setLineOffsets(int[] lineOffsets)
public void startElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes, org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
endElement(org.apache.xerces.xni.QName, org.apache.xerces.xni.Augmentations)
.startElement
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void characters(org.apache.xerces.xni.XMLString text, org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
characters
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void charactersAction() throws org.apache.xerces.xni.XNIException
org.apache.xerces.xni.XNIException
public void endElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
endElement
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void emptyElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes, org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
emptyElement
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void endElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.Augmentations augs, boolean wasEmptyElement) throws org.apache.xerces.xni.XNIException
org.apache.xerces.xni.XNIException
public void endDocument(org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
endDocument
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void error(String domain, String key, org.apache.xerces.xni.parser.XMLParseException e)
error
in interface org.apache.xerces.xni.parser.XMLErrorHandler
public void fatalError(String domain, String key, org.apache.xerces.xni.parser.XMLParseException e) throws org.apache.xerces.xni.XNIException
fatalError
in interface org.apache.xerces.xni.parser.XMLErrorHandler
org.apache.xerces.xni.XNIException
public void processingInstruction(String target, org.apache.xerces.xni.XMLString data, org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
processingInstruction
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void comment(org.apache.xerces.xni.XMLString content, org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
comment
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void startCDATA(org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
startCDATA
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void endCDATA(org.apache.xerces.xni.Augmentations augs) throws org.apache.xerces.xni.XNIException
endCDATA
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void addRepositioningInfo(int contentLength, int pos, int extractedPos)
protected void customizeAppearanceOfDocumentWithStartTag(String tagName)
tagName
- the HTML tag encountered by the HTML parser
protected void customizeAppearanceOfDocumentWithEndTag(String tagName)
tagName
- the HTML tag encountered by the HTML parser
public void setRepositioningInfo(RepositioningInfo info)
public RepositioningInfo getRepositioningInfo()
public void setAmpCodingInfo(RepositioningInfo info)
public RepositioningInfo getAmpCodingInfo()
public void setIgnorableTags(Set<String> newTags)
newTags
- a set of lower-case tag names
public Set<String> getIgnorableTags()
public int getCustomObjectsId()
public void addStatusListener(StatusListener listener)
public void removeStatusListener(StatusListener listener)
protected void fireStatusChangedEvent(String text)
public void doctypeDecl(String arg0, String arg1, String arg2, org.apache.xerces.xni.Augmentations arg3) throws org.apache.xerces.xni.XNIException
doctypeDecl
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void endGeneralEntity(String arg0, org.apache.xerces.xni.Augmentations arg1) throws org.apache.xerces.xni.XNIException
endGeneralEntity
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public org.apache.xerces.xni.parser.XMLDocumentSource getDocumentSource()
getDocumentSource
in interface org.apache.xerces.xni.XMLDocumentHandler
public void ignorableWhitespace(org.apache.xerces.xni.XMLString arg0, org.apache.xerces.xni.Augmentations arg1) throws org.apache.xerces.xni.XNIException
ignorableWhitespace
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void setDocumentSource(org.apache.xerces.xni.parser.XMLDocumentSource arg0)
setDocumentSource
in interface org.apache.xerces.xni.XMLDocumentHandler
public void startDocument(org.apache.xerces.xni.XMLLocator arg0, String arg1, org.apache.xerces.xni.NamespaceContext arg2, org.apache.xerces.xni.Augmentations arg3) throws org.apache.xerces.xni.XNIException
startDocument
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void startGeneralEntity(String arg0, org.apache.xerces.xni.XMLResourceIdentifier arg1, String arg2, org.apache.xerces.xni.Augmentations arg3) throws org.apache.xerces.xni.XNIException
startGeneralEntity
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void textDecl(String arg0, String arg1, org.apache.xerces.xni.Augmentations arg2) throws org.apache.xerces.xni.XNIException
textDecl
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
public void xmlDecl(String arg0, String arg1, String arg2, org.apache.xerces.xni.Augmentations arg3) throws org.apache.xerces.xni.XNIException
xmlDecl
in interface org.apache.xerces.xni.XMLDocumentHandler
org.apache.xerces.xni.XNIException
Copyright © 2024 GATE. All rights reserved.