"""
corpkit: Corpus and Corpus-like objects
"""
from __future__ import print_function

from lazyprop import lazyprop

from corpkit.process import classname
class Corpus(object):
    """
    A class representing a linguistic text corpus, which contains files,
    optionally within subcorpus folders.

    Methods for concordancing, interrogating, getting general stats, getting
    behaviour of particular word, etc.
    """

    def __init__(self, path, **kwargs):
        """
        :param path: path to a corpus directory or file, or a
                     Datalist/list of corpus objects sharing a parent dir
        :param kwargs: ``level`` ('c'/'s'/'f'), ``datatype``, ``print_info``,
                       ``load_saved``
        """
        import glob
        import os
        from os.path import join, isdir, abspath, dirname, basename
        from corpkit.process import determine_datatype

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one is determined automatically below, and processed accordingly. We
        # assume it is a full corpus to begin with.
        self.data = None
        level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        print_info = kwargs.get('print_info', True)

        if isinstance(path, (Datalist, list)):
            # modelled on existing corpus objects: take the shared parent dir
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.name = basename(self.path)
            self.data = path
        else:
            self.path = abspath(path)
            self.name = basename(path)

        # this messy code figures out as quickly as possible what the datatype
        # and singlefile status of the path is. it's messy because it shortcuts
        # full checking where possible some of the shortcutting could maybe be
        # moved into the determine_datatype() funct.
        self.singlefile = False
        if os.path.isfile(self.path):
            if self.path.endswith('.xml'):
                self.datatype = 'parse'
            self.singlefile = True
        elif self.path.endswith('-parsed'):
            # a '-parsed' suffix is assumed to mean CoreNLP XML output
            if not isdir(self.path):
                if isdir(join('data', path)):
                    self.path = abspath(join('data', path))
            self.datatype = 'parse'
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) > 0:
                self.singlefile = False
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) == 0:
                # no subdirectories: treat as a subcorpus
                level = 's'
        else:
            if level == 'c':
                if not self.datatype:
                    self.datatype, self.singlefile = determine_datatype(
                        self.path)
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) == 0:
                level = 's'

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'
        self.level = level

        # load each saved interrogation for this corpus as an attribute
        if kwargs.get('load_saved', False):
            from corpkit.other import load
            from corpkit.process import makesafe
            if os.path.isdir('saved_interrogations'):
                saved_files = glob.glob(r'saved_interrogations/*')
                for filepath in saved_files:
                    filename = os.path.basename(filepath)
                    if not filename.startswith(self.name):
                        continue
                    not_filename = filename.replace(self.name + '-', '')
                    not_filename = os.path.splitext(not_filename)[0]
                    # these three have dedicated lazy properties below
                    if not_filename in ['features', 'wordclasses', 'postags']:
                        continue
                    variable_safe = makesafe(not_filename)
                    try:
                        setattr(self, variable_safe, load(filename))
                        if print_info:
                            print(
                                '\tLoaded %s as %s attribute.' %
                                (filename, variable_safe))
                    except AttributeError:
                        if print_info:
                            print(
                                '\tFailed to load %s as %s attribute. Name conflict?' %
                                (filename, variable_safe))

        if print_info:
            print('Corpus: %s' % self.path)

    # these two are duplicated from the file object. not good.
    @lazyprop
    def document(self):
        """Return the parsed XML of a parsed file

        :raises ValueError: if this object is not file-level
        """
        if self.level != 'f':
            raise ValueError('Can only access document for File')
        from corenlp_xml.document import Document
        return Document(self.read())

    def read(self, **kwargs):
        """Read file data. If data is pickled, unpickle first

        :returns: str/unpickled data
        :raises ValueError: if this object is not file-level
        """
        if self.level != 'f':
            raise ValueError('Can only call read method on File')
        if self.datatype == 'tokens':
            # tokenised corpora are stored as pickles, so open binary
            import pickle
            with open(self.path, "rb", **kwargs) as openfile:
                data = pickle.load(openfile)
            return data
        else:
            with open(self.path, 'r', **kwargs) as openfile:
                data = openfile.read()
            return data

    @lazyprop
    def subcorpora(self):
        """A list-like object containing a corpus' subcorpora."""
        import re
        import os
        import operator
        from os.path import join, isdir
        # if initialised from a list of objects, just return them
        if self.data.__class__ == Datalist or isinstance(self.data, list):
            return self.data
        if self.level == 'c':
            # attribute names must be valid python identifiers
            variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)
            sbs = Datalist(sorted([Subcorpus(join(self.path, d), self.datatype)
                                   for d in os.listdir(self.path)
                                   if isdir(join(self.path, d))],
                                  key=operator.attrgetter('name')))
            for subcorpus in sbs:
                variable_safe = re.sub(variable_safe_r, '',
                                       subcorpus.name.lower().split(',')[0])
                setattr(self, variable_safe, subcorpus)
            return sbs

    @lazyprop
    def speakerlist(self):
        """A list of speakers in the corpus"""
        from corpkit.build import get_speaker_names_from_xml_corpus
        return get_speaker_names_from_xml_corpus(self.path)

    @lazyprop
    def files(self):
        """A list-like object containing the files in a folder

        >>> corpus.subcorpora[0].files
        """
        import os
        import operator
        if self.level == 's':
            # hidden files (dotfiles) are ignored
            fls = [f for f in os.listdir(self.path) if not f.startswith('.')]
            fls = [File(f, self.path, self.datatype) for f in fls]
            fls = sorted(fls, key=operator.attrgetter('name'))
            return Datalist(fls)

    def __str__(self):
        """String representation of corpus"""
        show = 'Corpus at %s:\n\nData type: %s\nNumber of subcorpora: %d\n' % (
            self.path, self.datatype, len(self.subcorpora))
        if self.singlefile:
            show += '\nCorpus is a single file.\n'
        if 'features' in self.__dict__.keys():
            if not self.singlefile:
                cols = list(self.features.columns)[:10]
                show += '\nFeatures:\n\n' + \
                    self.features.head(10).to_string(columns=cols)
            else:
                show += '\nFeatures:\n\n' + \
                    self.features.head(10).to_string()
        else:
            show += '\nFeatures not analysed yet. Use .features to calculate them.\n'
        return show

    def __repr__(self):
        """object representation of corpus"""
        import os
        if not self.subcorpora:
            ssubcorpora = ''
        else:
            ssubcorpora = self.subcorpora
        return "<%s instance: %s; %d subcorpora>" % (
            classname(self), os.path.basename(self.path), len(ssubcorpora))

    def __getitem__(self, key):
        """Access subcorpora by slice, index or (attribute-safe) name"""
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Datalist([self[ii] for ii in range(
                *key.indices(len(self.subcorpora)))])
        elif isinstance(key, int):
            return self.subcorpora.__getitem__(makesafe(self.subcorpora[key]))
        else:
            try:
                return self.subcorpora.__getattribute__(key)
            except AttributeError:
                # numeric names are stored with a 'c' prefix (see makesafe)
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    # METHODS

    @lazyprop
    def features(self):
        """
        Generate and show basic stats from the corpus, including number of
        sentences, clauses, process types, etc.

        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212
            02       25844    7933   6920                4313              3620     2270
            03       18376    5683   4877                3067              2616     1640
            04       20066    6354   5366                3587              2767     1775
        """
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        savedir = 'saved_interrogations'
        # prefer a previously saved result to re-interrogating
        if isfile(join(savedir, self.name + '-features.p')):
            try:
                return load(self.name + '-features').results
            except AttributeError:
                return load(self.name + '-features')
        else:
            feat = interrogator(self, 's', 'any').results
            if isdir(savedir):
                feat.save(self.name + '-features')
            return feat

    @lazyprop
    def wordclasses(self):
        """
        Generate and show the distribution of word classes (noun, verb,
        adjective, etc.) in the corpus, one row per subcorpus.

        :Example:

        >>> corpus.wordclasses
            SB   Noun  Verb  Adjective  Adverb  ...
            01   8513  7308       4809    3704  ...
        """
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags
        savedir = 'saved_interrogations'
        if isfile(join(savedir, self.name + '-wordclasses.p')):
            try:
                return load(self.name + '-wordclasses').results
            except AttributeError:
                return load(self.name + '-wordclasses')
        elif isfile(join(savedir, self.name + '-postags.p')):
            # derive wordclasses by merging saved POS tag counts
            try:
                posdata = load(self.name + '-postags').results
            except AttributeError:
                posdata = load(self.name + '-postags')
            return posdata.edit(
                merge_entries=mergetags,
                sort_by='total').results
        else:
            feat = interrogator(self, 't', 'any', show='pl').results
            if isdir(savedir):
                feat.save(self.name + '-wordclasses')
            return feat

    @lazyprop
    def postags(self):
        """
        Generate and show the distribution of POS tags in the corpus, one
        row per subcorpus.

        :Example:

        >>> corpus.postags
            SB     NN    VB    JJ    IN    DT  ...
            01  26873  8513  7308  4809  3704  ...
        """
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags
        savedir = 'saved_interrogations'
        if isfile(join(savedir, self.name + '-postags.p')):
            try:
                return load(self.name + '-postags').results
            except AttributeError:
                return load(self.name + '-postags')
        else:
            feat = interrogator(self, 't', 'any', show='p').results
            if isdir(savedir):
                feat.save(self.name + '-postags')
                # also save the merged wordclass version for later reuse
                wordclss = feat.edit(
                    merge_entries=mergetags,
                    sort_by='total').results
                wordclss.save(self.name + '-wordclasses')
            return feat

    def configurations(self, search, **kwargs):
        """
        Get the overall behaviour of tokens or lemmas matching a regular
        expression. The search below makes DataFrames containing the most
        common subjects, objects, modifiers (etc.) of 'see':

        :param search: Similar to `search` in the `interrogate()` /
           `concordance()` methods. `W`/`L` keys match word or lemma; `F` key
           specifies semantic role (`'participant'`, `'process'` or
           `'modifier'`). If `F` not specified, each role will be searched for.
        :type search: dict

        :Example:

        >>> see = corpus.configurations({L: 'see', F: 'process'}, show = L)
        >>> see.has_subject.results.sum()
            i           452
            it          227
            you         162
            we          111
            he           94

        :returns: :class:`corpkit.interrogation.Interrodict`
        """
        from corpkit.configurations import configurations
        return configurations(self, search, **kwargs)

    def interrogate(self, search, *args, **kwargs):
        """
        Interrogate a corpus of texts for a lexicogrammatical phenomenon.

        This method iterates over the files/folders in a corpus, searching the
        texts, and returning a :class:`corpkit.interrogation.Interrogation`
        object containing the results. The main options are `search`, where you
        specify search criteria, and `show`, where you specify what you want to
        appear in the output.

        :Example:

        >>> corpus = Corpus('data/conversations-parsed')
        ### show lemma form of nouns ending in 'ing'
        >>> q = {W: r'ing$', P: r'^N'}
        >>> data = corpus.interrogate(q, show = L)
        >>> data.results
            ..  something  anything  thing  feeling  everything  nothing  morning
            01         14        11     12        1           6        0        1
            02         10        20      4        4           8        3        0
            03         14         5      5        3           1        0        0
            ...                                                               ...

        :param search: What the query should be matching.
           - t: tree
           - w: word
           - l: lemma
           - p: pos
           - f: function
           - g/gw: governor
           - gl: governor's lemma form
           - gp: governor's pos
           - gf: governor's function
           - d/dependent
           - dl: dependent's lemma form
           - dp: dependent's pos
           - df: dependent's function
           - i/index
           - n/ngrams (deprecated, use ``show``)
           - s/general stats
        :type search: str or dict. dict is used when you have multiple criteria.
           Keys are what to search as `str`, and values are the criteria, which
           is a Tregex query, a regex, or a list of words to match. Therefore,
           the two syntaxes below do the same thing:

        :Example:

        >>> corpus.interrogate(T, r'/NN.?/')
        >>> corpus.interrogate({T: r'/NN.?/'})

        :param searchmode: Return results matching any/all criteria
        :type searchmode: str -- `'any'`/`'all'`

        :param exclude: The inverse of `search`, removing results from search
        :type exclude: dict -- `{L: 'be'}`

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: str -- `'any'`/`'all'`

        :param query: A search query for the interrogation. This is only used
           when `search` is a string, or when multiprocessing. If `search` is a
           `dict`, the query/queries are stored there as the values instead.
           When multiprocessing, the following is possible:

        :Example:

        >>> {'Nouns': r'/NN.?/', 'Verbs': r'/VB.?/'}
        ### return an :class:`corpkit.interrogation.Interrodict` object:
        >>> corpus.interrogate(T, q)
        ### return an :class:`corpkit.interrogation.Interrogation` object:
        >>> corpus.interrogate(T, q, show = C)

        :type query:
           - str -- regex/Tregex pattern (use when `search` is a `str`)
           - dict -- `{name: pattern}` (as per example above)
           - list -- word list to match

        :param show: What to output. If multiple strings are passed in as a
           ``list``, results will be colon-separated, in the supplied order. If
           you want to show ngrams, you can't have multiple values. Possible
           values are the same as those for ``search``, plus:
           - a/distance from root
           - n/ngram
           - nl/ngram lemma
           - np/ngram POS
           - npl/ngram wordclass
        :type show: ``str``/``list`` of strings

        :param lemmatise: Force lemmatisation on results. Deprecated:
           instead, output a lemma form with the `show` argument
        :type lemmatise: bool

        :param lemmatag: Explicitly pass a pos to lemmatiser (generally when
           data is unparsed), or when tag cannot be recovered from Tregex query
        :type lemmatag: False/'n'/'v'/'a'/'r'

        :param spelling: Convert all to U.S. or U.K. English
        :type spelling: False/'US'/'UK'

        :param dep_type: The kind of Stanford CoreNLP dependency parses you
           want to use
        :type dep_type: str -- 'basic-dependencies'/'a',
           'collapsed-dependencies'/'b', 'collapsed-ccprocessed-dependencies'/'c'

        :param save: Save result as pickle to `saved_interrogations/<save>` on
           completion
        :type save: str

        :param gramsize: size of n-grams (default 2)
        :type gramsize: int

        :param split_contractions: make `"don't"` et al into two tokens
        :type split_contractions: bool

        :param multiprocess: how many parallel processes to run
        :type multiprocess: int / bool (to determine automatically)

        :param files_as_subcorpora: treat each file as a subcorpus
        :type files_as_subcorpora: bool

        :param do_concordancing: Concordance while interrogating, store as
           `.concordance` attribute
        :type do_concordancing: bool/'only'

        :param maxconc: Maximum number of concordance lines
        :type maxconc: int

        :returns: A :class:`corpkit.interrogation.Interrogation` object, with
           `.query`, `.results`, `.totals` attributes. If multiprocessing is
           invoked, result may be a :class:`corpkit.interrogation.Interrodict`
           containing corpus names, queries or speakers as keys.
        """
        from corpkit.interrogator import interrogator
        par = kwargs.pop('multiprocess', None)
        kwargs.pop('corpus', None)
        if par and self.subcorpora:
            # spread subcorpora over n processes
            if isinstance(par, int):
                kwargs['multiprocess'] = par
            return interrogator(self.subcorpora, search, *args, **kwargs)
        else:
            return interrogator(self, search, *args, **kwargs)

    def parse(self, corenlppath=False, operations=False, copula_head=True,
              speaker_segmentation=False, memory_mb=False, multiprocess=False,
              split_texts=400, *args, **kwargs):
        """
        Parse an unparsed corpus, saving to disk

        :param corenlppath: folder containing corenlp jar files (use if
           *corpkit* can't find it automatically)
        :type corenlppath: str

        :param operations: which kinds of annotations to do
        :type operations: str

        :param speaker_segmentation: add speaker name to parser output if your
           corpus is script-like
        :type speaker_segmentation: bool

        :param memory_mb: Amount of memory in MB for parser
        :type memory_mb: int

        :param copula_head: Make copula head in dependency parse
        :type copula_head: bool

        :param multiprocess: Split parsing across n cores (for
           high-performance computers)
        :type multiprocess: int

        :param split_texts: split texts longer than this number of lines
        :type split_texts: int

        :Example:

        >>> parsed = corpus.parse(speaker_segmentation = True)
        >>> parsed
        <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        :raises ValueError: if the corpus is not plaintext
        """
        from corpkit.make import make_corpus
        if self.datatype != 'plaintext':
            raise ValueError(
                'parse method can only be used on plaintext corpora.')
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        return Corpus(
            make_corpus(
                self.path,
                parse=True,
                tokenise=False,
                corenlppath=corenlppath,
                operations=operations,
                copula_head=copula_head,
                speaker_segmentation=speaker_segmentation,
                memory_mb=memory_mb,
                multiprocess=multiprocess,
                # bugfix: previously the literal 400 was passed, silently
                # ignoring the user-supplied split_texts argument
                split_texts=split_texts,
                *args,
                **kwargs))

    def tokenise(self, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

        :param nltk_data_path: path to tokeniser if not found automatically
        :type nltk_data_path: str

        :Example:

        >>> tok = corpus.tokenise()
        >>> tok
        <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        :raises ValueError: if the corpus is not plaintext
        """
        from corpkit.make import make_corpus
        if self.datatype != 'plaintext':
            # bugfix: message previously said 'parse method'
            raise ValueError(
                'tokenise method can only be used on plaintext corpora.')
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        return Corpus(
            make_corpus(
                self.path,
                parse=False,
                tokenise=True,
                *args,
                **kwargs))

    def concordance(self, *args, **kwargs):
        """
        A concordance method for Tregex queries, CoreNLP dependencies,
        tokenised data or plaintext.

        :Example:

        >>> wv = ['want', 'need', 'feel', 'desire']
        >>> corpus.concordance({L: wv, F: 'root'})
           0   01  1-01.txt.xml                But , so I  feel  like i do that for w
           1   01  1-01.txt.xml                         I  felt  a little like oh , i
           2   01  1-01.txt.xml   he 's a difficult man I  feel  like his work ethic
           3   01  1-01.txt.xml                      So I  felt  like i recognized li
           ...                                                   ...

        Arguments are the same as
        :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

        :param only_format_match: if True, left and right window will just be
           words, regardless of what is in ``show``
        :type only_format_match: bool

        :param only_unique: only unique lines
        :type only_unique: bool

        :returns: A :class:`corpkit.interrogation.Concordance` instance
        """
        from corpkit.interrogator import interrogator
        kwargs.pop('do_concordancing', None)
        kwargs.pop('conc', None)
        kwargs.pop('corpus', None)
        return interrogator(self, do_concordancing='only', *args, **kwargs)

    def interroplot(self, search, **kwargs):
        """Interrogate, relativise, then plot, with very little
        customisability. A demo function.

        :Example:

        >>> corpus.interroplot(r'/NN.?/ >># NP')
        <matplotlib figure>

        :param search: search as per :func:`~corpkit.corpus.Corpus.interrogate`
        :type search: `dict`
        :param kwargs: extra arguments to pass to
           :func:`~corpkit.corpus.Corpus.visualise`
        :type kwargs: keyword arguments

        :returns: None (but show a plot)
        """
        # bugfix: `basestring` does not exist on Python 3; fall back to str
        try:
            string_types = basestring
        except NameError:
            string_types = str
        if isinstance(search, string_types):
            search = {'t': search}
        interro = self.interrogate(search=search, show=kwargs.pop('show', 'w'))
        edited = interro.edit('%', 'self', print_info=False)
        edited.visualise(self.name, **kwargs).show()

    def save(self, savename=False, **kwargs):
        """Save corpus class to file

        >>> corpus.save(filename)

        :param savename: name for the file (defaults to the corpus name)
        :type savename: str

        :returns: None
        """
        from corpkit.other import save
        if not savename:
            savename = self.name
        save(self, savename, savedir=kwargs.pop('savedir', 'data'), **kwargs)
class Subcorpus(Corpus):
    """Model a subcorpus, containing files but no subdirectories.

    Methods for interrogating, concordancing and configurations are the same as
    :class:`corpkit.corpus.Corpus`."""

    def __init__(self, path, datatype):
        self.path = path
        # subcorpora never print info and are always level 's'
        kwargs = {'print_info': False, 'level': 's', 'datatype': datatype}
        Corpus.__init__(self, self.path, **kwargs)

    def __str__(self):
        return self.path

    def __repr__(self):
        return "<%s instance: %s>" % (classname(self), self.name)

    def __getitem__(self, key):
        """Access files by slice, index or (attribute-safe) name"""
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Datalist([self[ii]
                             for ii in range(*key.indices(len(self.files)))])
        elif isinstance(key, int):
            return self.files.__getitem__(makesafe(self.files[key]))
        else:
            try:
                return self.files.__getattribute__(key)
            except AttributeError:
                # numeric names are stored with a 'c' prefix (see makesafe)
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)
class File(Corpus):
    """Models a corpus file for reading, interrogating, concordancing"""

    def __init__(self, path, dirname, datatype):
        from os.path import join
        self.path = join(dirname, path)
        # files never print info and are always level 'f'
        kwargs = {'print_info': False, 'level': 'f', 'datatype': datatype}
        Corpus.__init__(self, self.path, **kwargs)
        # override datatype based on the file extension
        if self.path.endswith('.p'):
            self.datatype = 'tokens'
        elif self.path.endswith('.xml'):
            self.datatype = 'parse'
        else:
            self.datatype = 'plaintext'

    def __repr__(self):
        return "<%s instance: %s>" % (classname(self), self.name)

    def __str__(self):
        return self.path

    @lazyprop
    def document(self):
        """Return the parsed XML of a parsed file"""
        from corenlp_xml.document import Document
        return Document(self.read())

    def read(self, **kwargs):
        """Read file data. If data is pickled, unpickle first

        :returns: str/unpickled data
        """
        if self.datatype == 'tokens':
            # tokenised corpora are stored as pickles, so open binary
            import pickle
            with open(self.path, "rb", **kwargs) as openfile:
                data = pickle.load(openfile)
            return data
        else:
            with open(self.path, 'r', **kwargs) as openfile:
                data = openfile.read()
            return data
class Datalist(object):
    """
    A list-like object containing subcorpora or corpus files.

    Objects can be accessed as attributes, dict keys or by indexing/slicing.
    Methods for interrogating, concordancing and getting configurations are the
    same as for :class:`corpkit.corpus.Corpus`
    """

    def __init__(self, data):
        from corpkit.process import makesafe
        # iteration index/bound used by __next__
        self.current = 0
        if data:
            self.high = len(data)
        else:
            self.high = 0
        self.data = data
        if data and len(data) > 0:
            # expose each item as an attribute with a python-safe name
            for subcorpus in data:
                safe_var = makesafe(subcorpus)
                setattr(self, safe_var, subcorpus)

    def __str__(self):
        stringform = []
        for i in self.data:
            stringform.append(i.name)
        return '\n'.join(stringform)

    def __repr__(self):
        return "<%s instance: %d items>" % (classname(self), len(self))

    def __delitem__(self, key):
        self.__delattr__(key)

    def __getitem__(self, key):
        """Access items by slice, index or (attribute-safe) name"""
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Datalist([self[ii]
                             for ii in range(*key.indices(len(self)))])
        elif isinstance(key, int):
            return self.__getitem__(makesafe(self.data[key]))
        else:
            try:
                return self.__getattribute__(key)
            except AttributeError:
                # numeric names are stored with a 'c' prefix (see makesafe)
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    def __setitem__(self, key, value):
        from corpkit.process import is_number
        # 'c01'-style keys (numeric names made attribute-safe) are stored
        # without their 'c' prefix
        if key.startswith('c') and len(key) > 1 and all(
                is_number(x) for x in key[1:]):
            self.__setattr__(key.lstrip('c'), value)
        else:
            self.__setattr__(key, value)

    def __iter__(self):
        for datum in self.data:
            yield datum

    def __len__(self):
        return len(self.data)

    def __next__(self):
        """Return the next index, raising StopIteration when exhausted."""
        # bugfix: '>' was an off-by-one, yielding len(data) + 1 indices
        if self.current >= self.high:
            raise StopIteration
        self.current += 1
        return self.current - 1

    # Python 2 compatibility alias
    next = __next__

    def interrogate(self, *args, **kwargs):
        """Interrogate the corpus using
        :func:`~corpkit.corpus.Corpus.interrogate`"""
        from corpkit.interrogator import interrogator
        return interrogator(self, *args, **kwargs)

    def concordance(self, *args, **kwargs):
        """Concordance the corpus using
        :func:`~corpkit.corpus.Corpus.concordance`"""
        from corpkit.interrogator import interrogator
        return interrogator(self, do_concordancing='only', *args, **kwargs)

    def configurations(self, search, **kwargs):
        """Get a configuration using
        :func:`~corpkit.corpus.Corpus.configurations`"""
        from corpkit.configurations import configurations
        return configurations(self, search, **kwargs)
class Corpora(Datalist):
    """
    Models a collection of Corpus objects. Methods are available for
    interrogating and plotting the entire collection. This is the highest level
    of abstraction available.

    :param data: Corpora to model
    :type data: `str` (path containing corpora), `list` (of corpus paths/
       :class:`corpkit.corpus.Corpus` objects)
    """

    def __init__(self, data=False, **kwargs):
        # if no arg, load every corpus in data dir
        if not data:
            data = 'data'
        # handle a folder containing corpora
        # (bugfix: was a mangled duplicate isinstance check)
        if isinstance(data, str):
            import os
            from os.path import join, isdir
            if not os.path.isdir(data):
                raise ValueError('Corpora(str) needs to point to a directory.')
            data = sorted([join(data, d) for d in os.listdir(data)
                           if isdir(join(data, d))])
        # otherwise, make a list of Corpus objects
        for index, i in enumerate(data):
            if isinstance(i, str):
                data[index] = Corpus(i, **kwargs)
        # now turn it into a Datalist
        Datalist.__init__(self, data)

    def __repr__(self):
        return "<%s instance: %d items>" % (classname(self), len(self))

    def __getitem__(self, key):
        """allow slicing, indexing"""
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Corpora([self[ii] for ii in range(*key.indices(len(self)))])
        elif isinstance(key, int):
            return self.__getitem__(makesafe(self.data[key]))
        else:
            try:
                return self.__getattribute__(key)
            except AttributeError:
                # numeric names are stored with a 'c' prefix (see makesafe)
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    @lazyprop
    def features(self):
        """Generate features attribute for all corpora"""
        for corpus in self:
            corpus.features

    @lazyprop
    def postags(self):
        """Generate postags attribute for all corpora"""
        for corpus in self:
            corpus.postags

    @lazyprop
    def wordclasses(self):
        """Generate wordclasses attribute for all corpora"""
        for corpus in self:
            corpus.wordclasses