Module gatenlp.serialization.default_tweetv1
Module that implements the various ways of how to save and load documents and change logs.
Expand source code
"""
Module that implements the various ways of how to save and load documents and change logs.
"""
from collections import defaultdict
from gatenlp.document import Document
from gatenlp.utils import get_nested
from gatenlp.urlfileutils import is_url, get_str_from_url, get_bytes_from_url
import json
JSON_WRITE = "wt"
JSON_READ = "rt"
TWITTER_DEFAULT_INCLUDE_FIELDS = [
"id_str",
"user.id_str",
"user.screen_name",
"user.name" "created_at",
"is_quote_status",
"quote_count",
"retweet_count",
"favourite_count",
"favourited",
"retweeted",
"lang",
"$is_retweet_status",
"retweeted_status.user.screen_name",
]
class TweetV1Serializer:
    """
    Serializer for representing gatenlp documents in the Twitter V1 (tweet JSON)
    format and for loading documents from tweets in that format.
    """

    @staticmethod
    def doc2twitterv1dict(doc, annspec=None, prefix_sep=None):
        """
        Convert a document to a dict laid out like a Twitter V1 tweet: the document
        text is stored under "full_text" and each annotation becomes an entity dict
        with an "indices" [start, end] pair plus the annotation features, grouped by
        annotation type under "entities".

        Args:
            doc: the document to convert
            annspec: which annotation sets/types to include, passed through to
                `doc.to_dict` (list of set names or (setname, types) tuples)
            prefix_sep: if not None, annotation types from non-default sets are
                prefixed with the set name followed by this separator

        Returns:
            a dict representing the document in Twitter V1 JSON layout
        """
        d = doc.to_dict(annspec=annspec)
        ret = {"full_text": doc.text}
        ents = defaultdict(list)
        for setname, annset in d.get("annotation_sets", {}).items():
            for ann in annset.get("annotations", []):
                anntype = ann["type"]
                # annotations from the default set (empty name) keep their type unchanged
                if prefix_sep is not None and setname != "":
                    anntype = setname + prefix_sep + anntype
                twitterann = {"indices": [ann["start"], ann["end"]]}
                twitterann.update(ann["features"])
                ents[anntype].append(twitterann)
        ret["entities"] = ents
        return ret

    @staticmethod
    def save(
        clazz,
        inst,
        to_ext=None,
        to_mem=None,
        annspec=None,
        prefix_sep=None,
        **kwargs,
    ):
        """
        Save a document in Twitter V1 JSON format.

        Args:
            clazz: the class of the object that gets saved (unused, kept for
                serializer-interface compatibility)
            inst: the document to save
            to_ext: where to save to, this should be a file path; only one of
                to_ext and to_mem should be specified
            to_mem: if True, return the JSON serialization as a string instead
                of writing to a file
            annspec: which annotation sets and types to include, list of set names
                or (setname, types) tuples
            prefix_sep: if not None, prefix all types with the name of the annotation
                set the annotation comes from and use the given string as the
                separator (can be the empty string for no separator). For annotations
                from the default set the type stays unchanged.
            **kwargs: ignored

        Returns:
            the JSON string if to_mem is truthy, otherwise None
        """
        d = TweetV1Serializer.doc2twitterv1dict(inst, annspec=annspec, prefix_sep=prefix_sep)
        if to_mem:
            return json.dumps(d)
        else:
            with open(to_ext, JSON_WRITE) as outfp:
                json.dump(d, outfp)

    @staticmethod
    def load(
        clazz,
        from_ext=None,
        from_mem=None,
        include_fields=None,
        include_entities=True,
        include_quote=False,
        outsetname="Original markups",
        tweet_ann="Tweet",
    ):
        """
        Load a tweet from Twitter V1 JSON format.

        IMPORTANT: this is still very experimental, will change in the future!

        Args:
            clazz: internal use
            from_ext: the file/url to load from
            from_mem: string to load from
            include_fields: a list of fields to include where nested field names are
                dot-separated, e.g. "user.location". All these fields are included
                using the nested field name in either the features of the tweet
                annotation with the type specified, or the features of the document
                if `tweet_ann` is None. If None, TWITTER_DEFAULT_INCLUDE_FIELDS is used.
            include_entities: create annotations for the tweet entities in the set
                with outsetname
            include_quote: if True, append the text of the quoted tweet (if any),
                separated by a newline, to the document text
            outsetname: the annotation set where to put entity annotations and the
                tweet annotation(s)
            tweet_ann: the annotation type to use to span the tweet and contain all
                the features, or None to store the features on the document instead

        Returns:
            document representing the tweet

        Raises:
            Exception: if neither from_ext nor from_mem is given, the JSON cannot
                be decoded, or no text field is found
        """
        if from_ext is not None:
            isurl, extstr = is_url(from_ext)
            if isurl:
                jsonstr = get_str_from_url(extstr, encoding="utf-8")
                tweet = json.loads(jsonstr)
            else:
                with open(extstr, "rt", encoding="utf-8") as infp:
                    tweet = json.load(infp)
        elif from_mem is not None:
            tweet = json.loads(from_mem)
        else:
            raise Exception("Cannot load from None")
        if tweet is None:
            raise Exception("Could not decode Tweet JSON")
        # for truncated tweets the full text lives under extended_tweet.full_text
        if tweet.get("truncated"):
            text = get_nested(tweet, "extended_tweet.full_text")
        else:
            text = get_nested(tweet, "text")
        if text is None:
            raise Exception("No text field found")
        quoted_status = None
        if include_quote:
            quoted_status = tweet.get("quoted_status")
            if quoted_status is not None:
                # NOTE(review): a truncated quoted tweet presumably also carries an
                # extended_tweet.full_text — TODO confirm and handle
                qtext = quoted_status.get("text", "")
                text += "\n" + qtext
        doc = Document(text)
        anns = doc.annset(outsetname)
        if tweet_ann:
            ann = anns.add(0, len(text), tweet_ann)
            features = ann.features
        else:
            features = doc.features
        if include_fields is None:
            include_fields = TWITTER_DEFAULT_INCLUDE_FIELDS
        for field in include_fields:
            # "$"-prefixed entries are computed pseudo-fields, not JSON paths
            if field.startswith("$"):
                if field == "$is_retweet_status":
                    rs = get_nested(tweet, "retweeted_status", silent=True)
                    if rs is not None:
                        features[field] = True
                continue
            val = get_nested(tweet, field, silent=True)
            if val is not None:
                features[field] = val
        if include_entities:
            if tweet.get("truncated"):
                entities = get_nested(tweet, "extended_tweet.entities", default={})
            else:
                entities = get_nested(tweet, "entities", default={})
            for etype, elist in entities.items():
                for ent in elist:
                    start, end = ent["indices"]
                    anns.add(start, end, etype)
        # TODO: if we have a quoted_status, add features and entities from there:
        # Essentially the same processing as for the original tweet, but at document offset
        # len(tweet)+1 (2?)
        return doc
Classes
class TweetV1Serializer
-
Expand source code
class TweetV1Serializer: @staticmethod def doc2twitterv1dict(doc, annspec=None, prefix_sep=None): d = doc.to_dict(annspec=annspec) ret = {"full_text": doc.text} ents = defaultdict(list) for setname, annset in d.get("annotation_sets", {}).items(): for ann in annset.get("annotations", []): anntype = ann["type"] if prefix_sep is not None and setname != "": anntype = setname + prefix_sep + anntype annlist = ents[anntype] twitterann = { "indices": [ann["start"], ann["end"]] } twitterann.update(ann["features"]) annlist.append(twitterann) ret["entities"] = ents return ret @staticmethod def save( clazz, inst, to_ext=None, to_mem=None, annspec=None, prefix_sep=None, **kwargs, ): """ Args: clazz: the class of the object that gets saved inst: the object to get saved to_ext: where to save to, this should be a file path, only one of to_ext and to_mem should be specified to_mem: if True, return a String serialization offset_type: the offset type to use for saving, if None (default) use "p" (Python) offset_mapper: the offset mapper to use, only needed if the type needs to get converted annspec: which annotation sets and types to include, list of set names or (setanmes, types) tuples prefix_types: if not None, prefix all types with the name of the annotation set the annotation comes from and use the given string as the separator (can be the empty string for no seaparator). For annotations from the default set the type stays unchanged. **kwargs: """ d = TweetV1Serializer.doc2twitterv1dict(inst, annspec=annspec, prefix_sep=prefix_sep) if to_mem: return json.dumps(d) else: with open(to_ext, JSON_WRITE) as outfp: json.dump(d, outfp) @staticmethod def load( clazz, from_ext=None, from_mem=None, include_fields=None, include_entities=True, include_quote=False, outsetname="Original markups", tweet_ann="Tweet", ): """ Load a tweet from Twitter JSON format. IMPORTANT: this is still very experimental, will change in the future! 
Args: clazz: internal use from_ext: the file/url to load from from_mem: string to load from include_fields: a list of fields to include where nested field names are dot-separated, e.g. "user.location". All these fields are included using the nested field name in either the features of the tweet annotation with the Type specified, or the features of the document if `tweet_ann` is None. include_entities: create annotations for the tweet entities in the set with outsetname include_quote: if True, add the quoted tweet after an empty line and treat it as a separate tweet just like the original tweet. outset: the annotation set where to put entity annotations and the tweet annotation(s) tweet_ann: the annotation type to use to span the tweet and contain all the features. Returns: document representing the tweet """ if from_ext is not None: isurl, extstr = is_url(from_ext) if isurl: jsonstr = get_str_from_url(extstr, encoding="utf-8") tweet = json.loads(jsonstr) else: with open(extstr, "rt", encoding="utf-8") as infp: tweet = json.load(infp) elif from_mem is not None: tweet = json.loads(from_mem) else: raise Exception("Cannot load from None") if tweet is None: raise Exception("Could not decode Tweet JSON") if tweet.get("truncated"): text = get_nested(tweet, "extended_tweet.full_text") else: text = get_nested(tweet, "text") if text is None: raise Exception("No text field found") quoted_status = None if include_quote: quoted_status = tweet.get("quoted_status") if quoted_status is not None: qtext = quoted_status.get("text", "") text += "\n" + qtext doc = Document(text) anns = doc.annset(outsetname) if tweet_ann: ann = anns.add(0, len(text), tweet_ann) features = ann.features else: features = doc.features if include_fields is None: include_fields = TWITTER_DEFAULT_INCLUDE_FIELDS for field in include_fields: if field.startswith("$"): if field == "$is_retweet_status": rs = get_nested(tweet, "retweeted_status", silent=True) if rs is not None: features[field] = True continue val 
= get_nested(tweet, field, silent=True) if val is not None: features[field] = val if include_entities: if tweet.get("truncated"): entities = get_nested(tweet, "extended_tweet.entities", default={}) else: entities = get_nested(tweet, "entities", default={}) for etype, elist in entities.items(): for ent in elist: start, end = ent["indices"] anns.add(start, end, etype) # TODO: if we have a quoted_status, add features and entities from there: # Essentially the same processing as for the original tweet, but at document offset # len(tweet)+1 (2?) return doc
Static methods
def doc2twitterv1dict(doc, annspec=None, prefix_sep=None)
-
Expand source code
@staticmethod def doc2twitterv1dict(doc, annspec=None, prefix_sep=None): d = doc.to_dict(annspec=annspec) ret = {"full_text": doc.text} ents = defaultdict(list) for setname, annset in d.get("annotation_sets", {}).items(): for ann in annset.get("annotations", []): anntype = ann["type"] if prefix_sep is not None and setname != "": anntype = setname + prefix_sep + anntype annlist = ents[anntype] twitterann = { "indices": [ann["start"], ann["end"]] } twitterann.update(ann["features"]) annlist.append(twitterann) ret["entities"] = ents return ret
def load(clazz, from_ext=None, from_mem=None, include_fields=None, include_entities=True, include_quote=False, outsetname='Original markups', tweet_ann='Tweet')
-
Load a tweet from Twitter JSON format.
IMPORTANT: this is still very experimental, will change in the future!
Args
clazz
- internal use
from_ext
- the file/url to load from
from_mem
- string to load from
include_fields
- a list of fields to include where nested field names are dot-separated, e.g.
"user.location". All these fields are included using the nested field name in either the
features of the tweet annotation with the Type specified, or the features of the document
if `tweet_ann` is None.
include_entities
- create annotations for the tweet entities in the set with outsetname
include_quote
- if True, add the quoted tweet after an empty line and treat it as a separate tweet just like the original tweet.
outset
- the annotation set where to put entity annotations and the tweet annotation(s)
tweet_ann
- the annotation type to use to span the tweet and contain all the features.
Returns
document representing the tweet
Expand source code
@staticmethod def load( clazz, from_ext=None, from_mem=None, include_fields=None, include_entities=True, include_quote=False, outsetname="Original markups", tweet_ann="Tweet", ): """ Load a tweet from Twitter JSON format. IMPORTANT: this is still very experimental, will change in the future! Args: clazz: internal use from_ext: the file/url to load from from_mem: string to load from include_fields: a list of fields to include where nested field names are dot-separated, e.g. "user.location". All these fields are included using the nested field name in either the features of the tweet annotation with the Type specified, or the features of the document if `tweet_ann` is None. include_entities: create annotations for the tweet entities in the set with outsetname include_quote: if True, add the quoted tweet after an empty line and treat it as a separate tweet just like the original tweet. outset: the annotation set where to put entity annotations and the tweet annotation(s) tweet_ann: the annotation type to use to span the tweet and contain all the features. 
Returns: document representing the tweet """ if from_ext is not None: isurl, extstr = is_url(from_ext) if isurl: jsonstr = get_str_from_url(extstr, encoding="utf-8") tweet = json.loads(jsonstr) else: with open(extstr, "rt", encoding="utf-8") as infp: tweet = json.load(infp) elif from_mem is not None: tweet = json.loads(from_mem) else: raise Exception("Cannot load from None") if tweet is None: raise Exception("Could not decode Tweet JSON") if tweet.get("truncated"): text = get_nested(tweet, "extended_tweet.full_text") else: text = get_nested(tweet, "text") if text is None: raise Exception("No text field found") quoted_status = None if include_quote: quoted_status = tweet.get("quoted_status") if quoted_status is not None: qtext = quoted_status.get("text", "") text += "\n" + qtext doc = Document(text) anns = doc.annset(outsetname) if tweet_ann: ann = anns.add(0, len(text), tweet_ann) features = ann.features else: features = doc.features if include_fields is None: include_fields = TWITTER_DEFAULT_INCLUDE_FIELDS for field in include_fields: if field.startswith("$"): if field == "$is_retweet_status": rs = get_nested(tweet, "retweeted_status", silent=True) if rs is not None: features[field] = True continue val = get_nested(tweet, field, silent=True) if val is not None: features[field] = val if include_entities: if tweet.get("truncated"): entities = get_nested(tweet, "extended_tweet.entities", default={}) else: entities = get_nested(tweet, "entities", default={}) for etype, elist in entities.items(): for ent in elist: start, end = ent["indices"] anns.add(start, end, etype) # TODO: if we have a quoted_status, add features and entities from there: # Essentially the same processing as for the original tweet, but at document offset # len(tweet)+1 (2?) return doc
def save(clazz, inst, to_ext=None, to_mem=None, annspec=None, prefix_sep=None, **kwargs)
-
Args
clazz: the class of the object that gets saved inst: the object to get saved to_ext: where to save to, this should be a file path, only one of to_ext and to_mem should be specified to_mem: if True, return a String serialization offset_type: the offset type to use for saving, if None (default) use "p" (Python) offset_mapper: the offset mapper to use, only needed if the type needs to get converted annspec: which annotation sets and types to include, list of set names or (set names, types) tuples prefix_types: if not None, prefix all types with the name of the annotation set the annotation comes from and use the given string as the separator (can be the empty string for no separator). For annotations from the default set the type stays unchanged. **kwargs:
Expand source code
@staticmethod def save( clazz, inst, to_ext=None, to_mem=None, annspec=None, prefix_sep=None, **kwargs, ): """ Args: clazz: the class of the object that gets saved inst: the object to get saved to_ext: where to save to, this should be a file path, only one of to_ext and to_mem should be specified to_mem: if True, return a String serialization offset_type: the offset type to use for saving, if None (default) use "p" (Python) offset_mapper: the offset mapper to use, only needed if the type needs to get converted annspec: which annotation sets and types to include, list of set names or (setanmes, types) tuples prefix_types: if not None, prefix all types with the name of the annotation set the annotation comes from and use the given string as the separator (can be the empty string for no seaparator). For annotations from the default set the type stays unchanged. **kwargs: """ d = TweetV1Serializer.doc2twitterv1dict(inst, annspec=annspec, prefix_sep=prefix_sep) if to_mem: return json.dumps(d) else: with open(to_ext, JSON_WRITE) as outfp: json.dump(d, outfp)