"""
corpkit: Corpus and Corpus-like objects
"""

from __future__ import print_function

from lazyprop import lazyprop
from corpkit.process import classname

class Corpus(object):
    """
    A class representing a linguistic text corpus, which contains files,
    optionally within subcorpus folders.

    Methods for concordancing, interrogating, getting general stats,
    getting the behaviour of particular words, etc.
    """

    def __init__(self, path, **kwargs):
        import re
        import operator
        import glob
        import os
        from os.path import join, isfile, isdir, abspath, dirname, basename
        from corpkit.process import determine_datatype

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file.
        # Which one is determined automatically below, and processed
        # accordingly. We assume it is a full corpus to begin with.
        self.data = None
        level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        print_info = kwargs.get('print_info', True)

        if isinstance(path, (Datalist, list)):
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.name = basename(self.path)
            self.data = path
        else:
            self.path = abspath(path)
            self.name = basename(path)

        # this messy code figures out as quickly as possible what the
        # datatype and singlefile status of the path is. it's messy because
        # it shortcuts full checking where possible. some of the shortcutting
        # could maybe be moved into the determine_datatype() funct.
        self.singlefile = False
        if os.path.isfile(self.path):
            if self.path.endswith('.xml'):
                self.datatype = 'parse'
                self.singlefile = True
        elif self.path.endswith('-parsed'):
            if not isdir(self.path):
                if isdir(join('data', path)):
                    self.path = abspath(join('data', path))
            self.datatype = 'parse'
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) > 0:
                self.singlefile = False
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) == 0:
                level = 's'
        else:
            if level == 'c':
                if not self.datatype:
                    self.datatype, self.singlefile = determine_datatype(
                        self.path)
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) == 0:
                level = 's'

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'

        self.level = level

        # load each saved interrogation as an attribute
        if kwargs.get('load_saved', False):
            from corpkit.other import load
            from corpkit.process import makesafe
            if os.path.isdir('saved_interrogations'):
                saved_files = glob.glob(r'saved_interrogations/*')
                for filepath in saved_files:
                    filename = os.path.basename(filepath)
                    if not filename.startswith(self.name):
                        continue
                    not_filename = filename.replace(self.name + '-', '')
                    not_filename = os.path.splitext(not_filename)[0]
                    if not_filename in ['features', 'wordclasses', 'postags']:
                        continue
                    variable_safe = makesafe(not_filename)
                    try:
                        setattr(self, variable_safe, load(filename))
                        if print_info:
                            print('\tLoaded %s as %s attribute.' %
                                  (filename, variable_safe))
                    except AttributeError:
                        if print_info:
                            print('\tFailed to load %s as %s attribute. '
                                  'Name conflict?' % (filename, variable_safe))

        if print_info:
            print('Corpus: %s' % self.path)

    # these two are duplicated from the File object. not good.
    @lazyprop
    def document(self):
        """Return the parsed XML of a parsed file"""
        if self.level != 'f':
            raise ValueError('Can only access document for File')
        from corenlp_xml.document import Document
        return Document(self.read())
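    # Usage sketch (hedged; the path below is illustrative and assumes a
    # parsed corpus has been placed under ./data):
    #
    #     >>> from corpkit.corpus import Corpus
    #     >>> corpus = Corpus('data/conversations-parsed')
    #     >>> corpus.subcorpora[0].files[:3]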
    def read(self, **kwargs):
        """
        Read file data. If data is pickled, unpickle first

        :returns: str/unpickled data
        """
        if self.level != 'f':
            raise ValueError('Can only call read method on File')
        if self.datatype == 'tokens':
            import pickle
            with open(self.path, 'rb', **kwargs) as openfile:
                data = pickle.load(openfile)
            return data
        else:
            with open(self.path, 'r', **kwargs) as openfile:
                data = openfile.read()
            return data
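    # Hedged example: .read() returns a str for plaintext/XML files and the
    # unpickled object for tokenised (.p) data; the indexing below assumes
    # at least one subcorpus and one file exist:
    #
    #     >>> text = corpus.subcorpora[0].files[0].read()
    #     >>> text[:60]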
    @lazyprop
    def subcorpora(self):
        """A list-like object containing a corpus' subcorpora."""
        import re
        import os
        import operator
        from os.path import join, isdir
        if self.data.__class__ == Datalist or isinstance(self.data, list):
            return self.data
        if self.level == 'c':
            variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)
            sbs = Datalist(sorted([Subcorpus(join(self.path, d), self.datatype)
                                   for d in os.listdir(self.path)
                                   if isdir(join(self.path, d))],
                                  key=operator.attrgetter('name')))
            for subcorpus in sbs:
                variable_safe = re.sub(variable_safe_r, '',
                                       subcorpus.name.lower().split(',')[0])
                setattr(self, variable_safe, subcorpus)
            return sbs

    @lazyprop
    def speakerlist(self):
        """A list of speakers in the corpus"""
        from corpkit.build import get_speaker_names_from_xml_corpus
        return get_speaker_names_from_xml_corpus(self.path)

    @lazyprop
    def files(self):
        """A list-like object containing the files in a folder

        :Example:

        >>> corpus.subcorpora[0].files
        """
        import re
        import os
        import operator
        from os.path import join, isdir
        if self.level == 's':
            fls = [f for f in os.listdir(self.path) if not f.startswith('.')]
            fls = [File(f, self.path, self.datatype) for f in fls]
            fls = sorted(fls, key=operator.attrgetter('name'))
            return Datalist(fls)

    def __str__(self):
        """String representation of corpus"""
        show = 'Corpus at %s:\n\nData type: %s\nNumber of subcorpora: %d\n' % (
            self.path, self.datatype, len(self.subcorpora))
        if self.singlefile:
            show += '\nCorpus is a single file.\n'
        if 'features' in self.__dict__.keys():
            if not self.singlefile:
                cols = list(self.features.columns)[:10]
                show += '\nFeatures:\n\n' + \
                    self.features.head(10).to_string(columns=cols)
            else:
                show += '\nFeatures:\n\n' + \
                    self.features.head(10).to_string()
        else:
            show += '\nFeatures not analysed yet. Use .features to calculate them.\n'
        return show

    def __repr__(self):
        """Object representation of corpus"""
        import os
        if not self.subcorpora:
            ssubcorpora = ''
        else:
            ssubcorpora = self.subcorpora
        return "<%s instance: %s; %d subcorpora>" % (
            classname(self), os.path.basename(self.path), len(ssubcorpora))

    def __getitem__(self, key):
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # get the start, stop, and step from the slice
            return Datalist([self[ii] for ii in
                             range(*key.indices(len(self.subcorpora)))])
        elif isinstance(key, int):
            return self.subcorpora.__getitem__(makesafe(self.subcorpora[key]))
        else:
            try:
                return self.subcorpora.__getattribute__(key)
            except:
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    # METHODS

    @lazyprop
    def features(self):
        """
        Generate and show basic stats from the corpus, including number of
        sentences, clauses, process types, etc.

        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212
            02       25844    7933   6920                4313              3620     2270
            03       18376    5683   4877                3067              2616     1640
            04       20066    6354   5366                3587              2767     1775
        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags
        savedir = 'saved_interrogations'
        if isfile(join(savedir, self.name + '-features.p')):
            try:
                return load(self.name + '-features').results
            except AttributeError:
                return load(self.name + '-features')
        else:
            feat = interrogator(self, 's', 'any').results
            if isdir(savedir):
                feat.save(self.name + '-features')
            return feat

    @lazyprop
    def wordclasses(self):
        """
        Generate and show the frequency of each word class in the corpus,
        merging part-of-speech tags into broader classes.

        :Example:

        >>> corpus.wordclasses
            SB  Noun  Verb  Adjective  Adverb  ...
            01   ...   ...        ...     ...  ...
        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags
        savedir = 'saved_interrogations'
        if isfile(join(savedir, self.name + '-wordclasses.p')):
            try:
                return load(self.name + '-wordclasses').results
            except AttributeError:
                return load(self.name + '-wordclasses')
        elif isfile(join(savedir, self.name + '-postags.p')):
            try:
                posdata = load(self.name + '-postags').results
            except AttributeError:
                posdata = load(self.name + '-postags')
            return posdata.edit(merge_entries=mergetags,
                                sort_by='total').results
        else:
            feat = interrogator(self, 't', 'any', show='pl').results
            if isdir(savedir):
                feat.save(self.name + '-wordclasses')
            return feat

    @lazyprop
    def postags(self):
        """
        Generate and show the frequency of each part-of-speech tag in the
        corpus.

        :Example:

        >>> corpus.postags
            SB   NN   VB   JJ   IN   DT  ...
            01  ...  ...  ...  ...  ...  ...
        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags
        savedir = 'saved_interrogations'
        if isfile(join(savedir, self.name + '-postags.p')):
            try:
                return load(self.name + '-postags').results
            except AttributeError:
                return load(self.name + '-postags')
        else:
            feat = interrogator(self, 't', 'any', show='p').results
            if isdir(savedir):
                feat.save(self.name + '-postags')
                wordclss = feat.edit(merge_entries=mergetags,
                                     sort_by='total').results
                wordclss.save(self.name + '-wordclasses')
            return feat
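    # Note: features, wordclasses and postags are cached twice over: lazyprop
    # memoises the result on the instance, and (when ./saved_interrogations
    # exists) the result is pickled there, so later sessions load from disk
    # rather than re-interrogating. A minimal sketch of the round trip:
    #
    #     >>> corpus.features    # first access: interrogate, then pickle
    #     >>> corpus.features    # same session: served from the lazyprop cache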
    def configurations(self, search, **kwargs):
        """
        Get the overall behaviour of tokens or lemmas matching a regular
        expression. The search below makes DataFrames containing the most
        common subjects, objects, modifiers (etc.) of 'see':

        :param search: Similar to `search` in the `interrogate()` /
        `concordance()` methods. `W`/`L` keys match word or lemma; the `F`
        key specifies a semantic role (`'participant'`, `'process'` or
        `'modifier'`). If `F` is not specified, each role will be searched for.
        :type search: dict

        :Example:

        >>> see = corpus.configurations({L: 'see', F: 'process'}, show=L)
        >>> see.has_subject.results.sum()
        i      452
        it     227
        you    162
        we     111
        he      94

        :returns: :class:`corpkit.interrogation.Interrodict`
        """
        from corpkit.configurations import configurations
        return configurations(self, search, **kwargs)
    def interrogate(self, search, *args, **kwargs):
        """
        Interrogate a corpus of texts for a lexicogrammatical phenomenon.

        This method iterates over the files/folders in a corpus, searching
        the texts, and returning a
        :class:`corpkit.interrogation.Interrogation` object containing the
        results. The main options are `search`, where you specify search
        criteria, and `show`, where you specify what you want to appear in
        the output.

        :Example:

        >>> corpus = Corpus('data/conversations-parsed')
        ### show lemma form of nouns ending in 'ing'
        >>> q = {W: r'ing$', P: r'^N'}
        >>> data = corpus.interrogate(q, show=L)
        >>> data.results
            ..  something  anything  thing  feeling  everything  nothing  morning
            01         14        11     12        1           6        0        1
            02         10        20      4        4           8        3        0
            03         14         5      5        3           1        0        0
            ...                                                              ...

        :param search: What the query should be matching.

           - t: tree
           - w: word
           - l: lemma
           - p: pos
           - f: function
           - g/gw: governor
           - gl: governor's lemma form
           - gp: governor's pos
           - gf: governor's function
           - d/dw: dependent
           - dl: dependent's lemma form
           - dp: dependent's pos
           - df: dependent's function
           - i: index
           - n: ngrams (deprecated, use ``show``)
           - s: general stats

        :type search: str or dict. dict is used when you have multiple
        criteria. Keys are what to search as `str`, and values are the
        criteria: a Tregex query, a regex, or a list of words to match.
        Therefore, the two syntaxes below do the same thing:

        :Example:

        >>> corpus.interrogate(T, r'/NN.?/')
        >>> corpus.interrogate({T: r'/NN.?/'})

        :param searchmode: Return results matching any/all criteria
        :type searchmode: str -- `'any'`/`'all'`

        :param exclude: The inverse of `search`, removing results from search
        :type exclude: dict -- `{L: 'be'}`

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: str -- `'any'`/`'all'`

        :param query: A search query for the interrogation. This is only
        used when `search` is a string, or when multiprocessing. If `search`
        is a `dict`, the query/queries are stored there as the values
        instead. When multiprocessing, the following is possible:

        :Example:

        >>> q = {'Nouns': r'/NN.?/', 'Verbs': r'/VB.?/'}
        ### return an :class:`corpkit.interrogation.Interrodict` object:
        >>> corpus.interrogate(T, q)
        ### return an :class:`corpkit.interrogation.Interrogation` object:
        >>> corpus.interrogate(T, q, show=C)

        :type query:
           - str -- regex/Tregex pattern (use when `search` is a `str`)
           - dict -- `{name: pattern}` (as per example above)
           - list -- word list to match

        :param show: What to output. If multiple strings are passed in as a
        ``list``, results will be colon-separated, in the supplied order. If
        you want to show ngrams, you can't have multiple values. Possible
        values are the same as those for ``search``, plus:

           - a: distance from root
           - n: ngram
           - nl: ngram lemma
           - np: ngram POS
           - npl: ngram wordclass

        :type show: ``str``/``list`` of strings

        :param lemmatise: Force lemmatisation on results. Deprecated:
        instead, output a lemma form with the `show` argument
        :type lemmatise: bool

        :param lemmatag: Explicitly pass a POS to the lemmatiser (generally
        when data is unparsed, or when the tag cannot be recovered from the
        Tregex query)
        :type lemmatag: False/'n'/'v'/'a'/'r'

        :param spelling: Convert all to U.S. or U.K. English
        :type spelling: False/'US'/'UK'

        :param dep_type: The kind of Stanford CoreNLP dependency parses you
        want to use
        :type dep_type: str -- 'basic-dependencies'/'a',
        'collapsed-dependencies'/'b', 'collapsed-ccprocessed-dependencies'/'c'

        :param save: Save result as pickle to `saved_interrogations/<save>`
        on completion
        :type save: str

        :param gramsize: Size of n-grams (default 2)
        :type gramsize: int

        :param split_contractions: Make `"don't"` et al. into two tokens
        :type split_contractions: bool

        :param multiprocess: How many parallel processes to run
        :type multiprocess: int / bool (to determine automatically)

        :param files_as_subcorpora: Treat each file as a subcorpus
        :type files_as_subcorpora: bool

        :param do_concordancing: Concordance while interrogating, storing
        the result as a `.concordance` attribute
        :type do_concordancing: bool/'only'

        :param maxconc: Maximum number of concordance lines
        :type maxconc: int

        :returns: A :class:`corpkit.interrogation.Interrogation` object,
        with `.query`, `.results`, `.totals` attributes. If multiprocessing
        is invoked, the result may be a
        :class:`corpkit.interrogation.Interrodict` containing corpus names,
        queries or speakers as keys.
        """
        from corpkit.interrogator import interrogator
        par = kwargs.pop('multiprocess', None)
        kwargs.pop('corpus', None)
        if par and self.subcorpora:
            if isinstance(par, int):
                kwargs['multiprocess'] = par
            return interrogator(self.subcorpora, search, *args, **kwargs)
        else:
            return interrogator(self, search, *args, **kwargs)
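    # A hedged sketch of a fuller call, combining search, exclude and
    # multiprocessing as documented above (W and L are the interrogation key
    # constants used throughout these docstrings; the pattern is
    # illustrative):
    #
    #     >>> data = corpus.interrogate({W: r'ing$'}, exclude={L: 'thing'},
    #     ...                           show=L, multiprocess=4)
    #     >>> data.results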
    def parse(self, corenlppath=False, operations=False, copula_head=True,
              speaker_segmentation=False, memory_mb=False, multiprocess=False,
              split_texts=400, *args, **kwargs):
        """
        Parse an unparsed corpus, saving to disk

        :param corenlppath: Folder containing CoreNLP jar files (use if
        *corpkit* can't find it automatically)
        :type corenlppath: str

        :param operations: Which kinds of annotations to do
        :type operations: str

        :param speaker_segmentation: Add speaker name to parser output if
        your corpus is script-like
        :type speaker_segmentation: bool

        :param memory_mb: Amount of memory in MB for the parser
        :type memory_mb: int

        :param copula_head: Make copula head in dependency parse
        :type copula_head: bool

        :param multiprocess: Split parsing across n cores (for
        high-performance computers)
        :type multiprocess: int

        :Example:

        >>> parsed = corpus.parse(speaker_segmentation=True)
        >>> parsed
        <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from corpkit.make import make_corpus
        #from corpkit.process import determine_datatype
        #dtype, singlefile = determine_datatype(self.path)
        if self.datatype != 'plaintext':
            raise ValueError(
                'parse method can only be used on plaintext corpora.')
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        return Corpus(
            make_corpus(
                self.path,
                parse=True,
                tokenise=False,
                corenlppath=corenlppath,
                operations=operations,
                copula_head=copula_head,
                speaker_segmentation=speaker_segmentation,
                memory_mb=memory_mb,
                multiprocess=multiprocess,
                split_texts=split_texts,
                *args,
                **kwargs))
    def tokenise(self, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

        :param nltk_data_path: Path to tokeniser if not found automatically
        :type nltk_data_path: str

        :Example:

        >>> tok = corpus.tokenise()
        >>> tok
        <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from corpkit.make import make_corpus
        #from corpkit.process import determine_datatype
        #dtype, singlefile = determine_datatype(self.path)
        if self.datatype != 'plaintext':
            raise ValueError(
                'tokenise method can only be used on plaintext corpora.')
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        return Corpus(
            make_corpus(
                self.path,
                parse=False,
                tokenise=True,
                *args,
                **kwargs))
    def concordance(self, *args, **kwargs):
        """
        A concordance method for Tregex queries, CoreNLP dependencies,
        tokenised data or plaintext.

        :Example:

        >>> wv = ['want', 'need', 'feel', 'desire']
        >>> corpus.concordance({L: wv, F: 'root'})
           0   01  1-01.txt.xml                 But , so I  feel  like i do that for w
           1   01  1-01.txt.xml                          I  felt  a little like oh , i
           2   01  1-01.txt.xml    he 's a difficult man I  feel  like his work ethic
           3   01  1-01.txt.xml                      So I  felt  like i recognized li
           ...                                                                     ...

        Arguments are the same as
        :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

        :param only_format_match: If True, left and right window will just
        be words, regardless of what is in ``show``
        :type only_format_match: bool

        :param only_unique: Return only unique lines
        :type only_unique: bool

        :returns: A :class:`corpkit.interrogation.Concordance` instance
        """
        from corpkit.interrogator import interrogator
        kwargs.pop('do_concordancing', None)
        kwargs.pop('conc', None)
        kwargs.pop('corpus', None)
        return interrogator(self, do_concordancing='only', *args, **kwargs)
    def interroplot(self, search, **kwargs):
        """
        Interrogate, relativise, then plot, with very little customisability.
        A demo function.

        :Example:

        >>> corpus.interroplot(r'/NN.?/ >># NP')
        <matplotlib figure>

        :param search: Search as per :func:`~corpkit.corpus.Corpus.interrogate`
        :type search: `dict`

        :param kwargs: Extra arguments to pass to
        :func:`~corpkit.corpus.Corpus.visualise`
        :type kwargs: keyword arguments

        :returns: None (but show a plot)
        """
        try:
            stringtype = basestring  # Python 2
        except NameError:
            stringtype = str  # Python 3
        if isinstance(search, stringtype):
            search = {'t': search}
        interro = self.interrogate(search=search, show=kwargs.pop('show', 'w'))
        edited = interro.edit('%', 'self', print_info=False)
        edited.visualise(self.name, **kwargs).show()
    def save(self, savename=False, **kwargs):
        """
        Save corpus class to file

        >>> corpus.save(filename)

        :param savename: Name for the file
        :type savename: str

        :returns: None
        """
        from corpkit.other import save
        if not savename:
            savename = self.name
        save(self, savename, savedir=kwargs.pop('savedir', 'data'), **kwargs)
class Subcorpus(Corpus):
    """
    Model a subcorpus, containing files but no subdirectories.

    Methods for interrogating, concordancing and configurations are the same
    as :class:`corpkit.corpus.Corpus`.
    """

    def __init__(self, path, datatype):
        self.path = path
        kwargs = {'print_info': False, 'level': 's', 'datatype': datatype}
        Corpus.__init__(self, self.path, **kwargs)

    def __str__(self):
        return self.path

    def __repr__(self):
        return "<%s instance: %s>" % (classname(self), self.name)

    def __getitem__(self, key):
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # get the start, stop, and step from the slice
            return Datalist([self[ii] for ii in
                             range(*key.indices(len(self.files)))])
        elif isinstance(key, int):
            return self.files.__getitem__(makesafe(self.files[key]))
        else:
            try:
                return self.files.__getattribute__(key)
            except:
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)
class File(Corpus):
    """Models a corpus file for reading, interrogating, concordancing"""

    def __init__(self, path, dirname, datatype):
        import os
        from os.path import join, isfile, isdir
        self.path = join(dirname, path)
        kwargs = {'print_info': False, 'level': 'f', 'datatype': datatype}
        Corpus.__init__(self, self.path, **kwargs)
        if self.path.endswith('.p'):
            self.datatype = 'tokens'
        elif self.path.endswith('.xml'):
            self.datatype = 'parse'
        else:
            self.datatype = 'plaintext'

    def __repr__(self):
        return "<%s instance: %s>" % (classname(self), self.name)

    def __str__(self):
        return self.path

    @lazyprop
    def document(self):
        """Return the parsed XML of a parsed file"""
        from corenlp_xml.document import Document
        return Document(self.read())
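    # Hedged usage sketch: .document parses the file with the third-party
    # corenlp_xml package, whose Document objects expose the annotated
    # sentences (attribute names per that library's API):
    #
    #     >>> f = corpus.subcorpora[0].files[0]
    #     >>> doc = f.document
    #     >>> len(doc.sentences)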
    def read(self, **kwargs):
        """
        Read file data. If data is pickled, unpickle first

        :returns: str/unpickled data
        """
        if self.datatype == 'tokens':
            import pickle
            with open(self.path, 'rb', **kwargs) as openfile:
                data = pickle.load(openfile)
            return data
        else:
            with open(self.path, 'r', **kwargs) as openfile:
                data = openfile.read()
            return data
class Datalist(object):
    """
    A list-like object containing subcorpora or corpus files.

    Objects can be accessed as attributes, dict keys or by indexing/slicing.
    Methods for interrogating, concordancing and getting configurations are
    the same as for :class:`corpkit.corpus.Corpus`
    """

    def __init__(self, data):
        import re
        import os
        from os.path import join, isfile, isdir
        from corpkit.process import makesafe
        self.current = 0
        if data:
            self.high = len(data)
        else:
            self.high = 0
        self.data = data
        if data and len(data) > 0:
            for subcorpus in data:
                safe_var = makesafe(subcorpus)
                setattr(self, safe_var, subcorpus)

    def __str__(self):
        stringform = []
        for i in self.data:
            stringform.append(i.name)
        return '\n'.join(stringform)

    def __repr__(self):
        return "<%s instance: %d items>" % (classname(self), len(self))

    def __delitem__(self, key):
        self.__delattr__(key)

    def __getitem__(self, key):
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # get the start, stop, and step from the slice
            return Datalist([self[ii] for ii in
                             range(*key.indices(len(self)))])
        elif isinstance(key, int):
            return self.__getitem__(makesafe(self.data[key]))
        else:
            try:
                return self.__getattribute__(key)
            except:
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    def __setitem__(self, key, value):
        from corpkit.process import makesafe, is_number
        if key.startswith('c') and len(key) > 1 and all(
                is_number(x) for x in key[1:]):
            self.__setattr__(key.lstrip('c'), value)
        else:
            self.__setattr__(key, value)

    def __iter__(self):
        for datum in self.data:
            yield datum

    def __len__(self):
        return len(self.data)

    def __next__(self):
        if self.current >= self.high:
            raise StopIteration
        else:
            self.current += 1
            return self.current - 1
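    # Hedged sketch of the three access styles the docstring describes (the
    # attribute name depends on the makesafe()'d subcorpus name, so
    # 'chapter1' below is illustrative):
    #
    #     >>> corpus.subcorpora[0]         # by index
    #     >>> corpus.subcorpora[0:2]       # by slice, returns a Datalist
    #     >>> corpus.subcorpora.chapter1   # as an attribute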
    def interrogate(self, *args, **kwargs):
        """Interrogate the corpus using
        :func:`~corpkit.corpus.Corpus.interrogate`"""
        from corpkit.interrogator import interrogator
        return interrogator(self, *args, **kwargs)
    def concordance(self, *args, **kwargs):
        """Concordance the corpus using
        :func:`~corpkit.corpus.Corpus.concordance`"""
        from corpkit.interrogator import interrogator
        return interrogator(self, do_concordancing='only', *args, **kwargs)
    def configurations(self, search, **kwargs):
        """Get a configuration using
        :func:`~corpkit.corpus.Corpus.configurations`"""
        from corpkit.configurations import configurations
        return configurations(self, search, **kwargs)
class Corpora(Datalist):
    """
    Models a collection of Corpus objects. Methods are available for
    interrogating and plotting the entire collection. This is the highest
    level of abstraction available.

    :param data: Corpora to model
    :type data: `str` (path containing corpora) or `list` (of corpus paths
    and/or :class:`corpkit.corpus.Corpus` objects)
    """

    def __init__(self, data=False, **kwargs):
        # if no arg, load every corpus in the data dir
        if not data:
            data = 'data'
        # handle a folder containing corpora
        if isinstance(data, str):
            import os
            from os.path import join, isfile, isdir
            if not os.path.isdir(data):
                raise ValueError('Corpora(str) needs to point to a directory.')
            data = sorted([join(data, d) for d in os.listdir(data)
                           if isdir(join(data, d))])
        # otherwise, make a list of Corpus objects
        for index, i in enumerate(data):
            if isinstance(i, str):
                data[index] = Corpus(i, **kwargs)
        # now turn it into a Datalist
        Datalist.__init__(self, data)

    def __repr__(self):
        return "<%s instance: %d items>" % (classname(self), len(self))

    def __getitem__(self, key):
        """Allow slicing, indexing"""
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # get the start, stop, and step from the slice
            return Corpora([self[ii] for ii in
                            range(*key.indices(len(self)))])
        elif isinstance(key, int):
            return self.__getitem__(makesafe(self.data[key]))
        else:
            try:
                return self.__getattribute__(key)
            except:
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    @lazyprop
    def features(self):
        """Generate features attribute for all corpora"""
        for corpus in self:
            corpus.features

    @lazyprop
    def postags(self):
        """Generate postags attribute for all corpora"""
        for corpus in self:
            corpus.postags

    @lazyprop
    def wordclasses(self):
        """Generate wordclasses attribute for all corpora"""
        for corpus in self:
            corpus.wordclasses
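# Hedged end-to-end sketch (assumes ./data contains one folder per corpus;
# the paths and the query are illustrative):
#
#     >>> from corpkit.corpus import Corpora
#     >>> collection = Corpora('data')
#     >>> collection[0].interrogate({'t': r'/NN.?/'})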