Source code for corpkit.corpus

"""
corpkit: Corpus and Corpus-like objects
"""

from __future__ import print_function

from lazyprop import lazyprop
from corpkit.process import classname
from corpkit.constants import STRINGTYPE, PYTHON_VERSION

class Corpus(object):
    """
    A class representing a linguistic text corpus, which contains files,
    optionally within subcorpus folders.

    Methods for concordancing, interrogating, getting general stats, getting
    the behaviour of particular words, etc.

    Unparsed, tokenised and parsed corpora use the same class, though some
    methods are available only to one or the other. Only unparsed corpora can
    be parsed, and only parsed/tokenised corpora can be interrogated.
    """

    def __init__(self, path, **kwargs):
        import re
        import operator
        import glob
        import os
        from os.path import join, isfile, isdir, abspath, dirname, basename
        from corpkit.process import determine_datatype

        # Levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one applies is determined automatically below, and processed
        # accordingly. We assume it is a full corpus to begin with.

        def get_symbolics(self):
            return {'skip': self.skip,
                    'just': self.just,
                    'symbolic': self.symbolic}

        self.data = None
        self._dlist = None
        self.level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        self.print_info = kwargs.pop('print_info', True)
        self.symbolic = kwargs.get('subcorpora', False)
        self.skip = kwargs.get('skip', False)
        self.just = kwargs.get('just', False)
        self.kwa = get_symbolics(self)

        if isinstance(path, (list, Datalist)):
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.name = basename(self.path)
            self.data = path
            if self.level == 'd':
                self._dlist = path
        elif isinstance(path, STRINGTYPE):
            self.path = abspath(path)
            self.name = basename(path)
        elif hasattr(path, 'path') and path.path:
            self.path = abspath(path.path)
            self.name = basename(path.path)

        # This messy code figures out, as quickly as possible, the datatype
        # and singlefile status of the path. It's messy because it shortcuts
        # full checking where possible; some of the shortcutting could perhaps
        # be moved into the determine_datatype() function.
        if self.level == 'd':
            self.singlefile = len(self._dlist) == 1
        else:
            self.singlefile = False

        if os.path.isfile(self.path):
            self.singlefile = True
        else:
            if not isdir(self.path):
                if isdir(join('data', path)):
                    self.path = abspath(join('data', path))
            if self.path.endswith('-parsed') or self.path.endswith('-tokenised'):
                for r, d, f in os.walk(self.path):
                    if not f:
                        continue
                    if isinstance(f, str) and f.startswith('.'):
                        continue
                    if f[0].endswith('conll') or f[0].endswith('conllu'):
                        self.datatype = 'conll'
                        break
                if len([d for d in os.listdir(self.path)
                        if isdir(join(self.path, d))]) > 0:
                    self.singlefile = False
                if len([d for d in os.listdir(self.path)
                        if isdir(join(self.path, d))]) == 0:
                    self.level = 's'
            else:
                if self.level == 'c':
                    if not self.datatype:
                        self.datatype, self.singlefile = determine_datatype(self.path)
                if isdir(self.path):
                    if len([d for d in os.listdir(self.path)
                            if isdir(join(self.path, d))]) == 0:
                        self.level = 's'

        # if initialised on a file, process as file
        if self.singlefile and self.level == 'c':
            self.level = 'f'

        # load each saved interrogation as an attribute
        if kwargs.get('load_saved', False):
            from corpkit.other import load
            from corpkit.process import makesafe
            if os.path.isdir('saved_interrogations'):
                saved_files = glob.glob(r'saved_interrogations/*')
                for filepath in saved_files:
                    filename = os.path.basename(filepath)
                    if not filename.startswith(self.name):
                        continue
                    not_filename = filename.replace(self.name + '-', '')
                    not_filename = os.path.splitext(not_filename)[0]
                    if not_filename in ['features', 'wordclasses', 'postags']:
                        continue
                    variable_safe = makesafe(not_filename)
                    try:
                        setattr(self, variable_safe, load(filename))
                        if self.print_info:
                            print('\tLoaded %s as %s attribute.' %
                                  (filename, variable_safe))
                    except AttributeError:
                        if self.print_info:
                            print('\tFailed to load %s as %s attribute. '
                                  'Name conflict?' % (filename, variable_safe))

        if self.print_info:
            print('Corpus: %s' % self.path)

    @lazyprop
    def subcorpora(self):
        """
        A list-like object containing a corpus' subcorpora.
        """
        import re
        import os
        import operator
        from os.path import join, isdir
        if self.level == 'd':
            return
        if isinstance(self.data, (Datalist, list)):
            return self.data
        if self.level == 'c':
            variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)
            sbs = Datalist(sorted([Subcorpus(join(self.path, d),
                                             self.datatype, **self.kwa)
                                   for d in os.listdir(self.path)
                                   if isdir(join(self.path, d))],
                                  key=operator.attrgetter('name')), **self.kwa)
            for subcorpus in sbs:
                variable_safe = re.sub(variable_safe_r, '',
                                       subcorpus.name.lower().split(',')[0])
                setattr(self, variable_safe, subcorpus)
            return sbs

    @lazyprop
    def speakerlist(self):
        """
        A list of speakers in the corpus
        """
        from corpkit.build import get_speaker_names_from_parsed_corpus
        return get_speaker_names_from_parsed_corpus(self)

    @lazyprop
    def files(self):
        """
        A list-like object containing the files in a folder

        :Example:

        >>> corpus.subcorpora[0].files
        """
        import re
        import os
        import operator
        from os.path import join, isdir
        if self.level == 's':
            fls = [f for f in os.listdir(self.path) if not f.startswith('.')]
            fls = [File(f, self.path, self.datatype, **self.kwa) for f in fls]
            fls = sorted(fls, key=operator.attrgetter('name'))
            return Datalist(fls, **self.kwa)
        elif self.level == 'd':
            return self._dlist

    @lazyprop
    def all_filepaths(self):
        """
        Lazy-load a list of all filepaths in a corpus
        """
        if self.level == 'f':
            return [self.path]
        if self.files:
            return [i.path for i in self.files]
        fs = []
        for sc in self.subcorpora:
            for f in sc.files:
                fs.append(f.path)
        return fs
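    # --- Usage sketch (illustrative; not part of the original module) --------
    # Assuming a parsed corpus lives at 'data/example-parsed', with one folder
    # per subcorpus, navigation works like this. Path and names hypothetical.
    #
    # >>> corpus = Corpus('data/example-parsed')
    # >>> corpus.subcorpora           # Datalist of Subcorpus objects
    # >>> corpus.subcorpora[0].files  # Datalist of Files in first subcorpus
    # >>> corpus.all_filepaths[:3]    # flat list of paths across subcorpora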
    def conll_conform(self, errors='raise'):
        """
        This removes sent index column from old corpkit data
        """
        from corpkit.constants import OPENER
        fs = self.all_filepaths
        for i, f in enumerate(fs, start=1):
            badfile = False
            print('Doing %s/%s' % (i, len(fs)))
            fdata = []
            with OPENER(f, 'r', encoding='utf-8') as fo:
                lines = fo.read().splitlines()
            for line in lines:
                if line.startswith('# sent_id'):
                    continue
                if line and not line.startswith('#'):
                    try:
                        splut = line.split('\t', 1)[1]
                    except IndexError:
                        raise IndexError('Failed on file %s' % f)
                    if not splut[0].isdigit():
                        if errors == 'raise':
                            raise ValueError('File %s does not appear '
                                             'to be in old format.' % f)
                        else:
                            if badfile:
                                continue
                            badfile = True
                            continue
                    else:
                        line = splut
                        # add v column with nothing in it
                        line = line.split('\t')
                        line.insert(5, '_')
                        line = '\t'.join(line)
                fdata.append(line)
            if badfile and errors != 'raise':
                print('Skipping %s' % f)
                continue
            with OPENER(f, 'w', encoding='utf-8') as fo:
                fo.write('\n'.join(fdata))
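    # --- What conll_conform does (illustrative; field values hypothetical) ---
    # Old corpkit CONLL lines carried a leading sentence-index column; the
    # method drops it and inserts a placeholder column at index 5.
    #
    #   before:  1 <TAB> 2 <TAB> cats <TAB> cat <TAB> NNS <TAB> 4 <TAB> nsubj
    #   after:   2 <TAB> cats <TAB> cat <TAB> NNS <TAB> 4 <TAB> _ <TAB> nsubj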
    @lazyprop
    def all_files(self):
        """
        Lazy-load a list of all File objects in a corpus
        """
        if self.level == 'f':
            return Datalist([self])
        if self.files:
            return self.files
        fs = []
        for sc in self.subcorpora:
            for f in sc.files:
                fs.append(f)
        return Datalist(fs)
    def tfidf(self, search={'w': 'any'}, show=['w'], **kwargs):
        """
        Generate a TF-IDF vector representation of the corpus using the
        interrogate method. All args and kwargs go to
        :func:`~corpkit.corpus.Corpus.interrogate`.

        :returns: Tuple: the vectoriser and matrix
        """
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectoriser = TfidfVectorizer(input='content',
                                     tokenizer=lambda x: x.split())
        res = self.interrogate(search=search, show=show, **kwargs).results

        # there is also a string repeat method, which could be better
        def dupe_string(line):
            """Duplicate each entry name by its count and return a string"""
            return ''.join([(w + ' ') * line[w] for w in line.index])

        ser = res.apply(dupe_string, axis=1)
        vec = vectoriser.fit_transform(ser.values)
        # todo: subcorpora names are lost?
        return vectoriser, vec
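    # --- Usage sketch (illustrative; not part of the original module) --------
    # The returned vectoriser can transform further text with the same
    # vocabulary, e.g. to compare a new document against each subcorpus.
    # Since TfidfVectorizer l2-normalises rows by default, the dot products
    # below are cosine similarities. The query string is hypothetical.
    #
    # >>> vectoriser, matrix = corpus.tfidf(show=['l'])
    # >>> matrix.shape                            # (n_subcorpora, n_vocab)
    # >>> new = vectoriser.transform(['cat sit on mat'])
    # >>> (matrix * new.T).toarray()              # similarity per subcorpus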
    def __str__(self):
        """
        String representation of corpus
        """
        showing = 'subcorpora'
        if getattr(self, 'subcorpora', False):
            sclen = len(self.subcorpora)
        else:
            showing = 'files'
            sclen = len(self.files)
        show = 'Corpus at %s:\n\nData type: %s\nNumber of %s: %d\n' % (
            self.path, self.datatype, showing, sclen)
        val = self.symbolic if self.symbolic else 'default'
        show += 'Subcorpora: %s\n' % val
        if self.singlefile:
            show += '\nCorpus is a single file.\n'
        #if getattr(self, 'symbolic'):
        #    show += 'Symbolic subcorpora: %s\n' % str(self.symbolic)
        if getattr(self, 'skip'):
            show += 'Skip: %s\n' % str(self.skip)
        if getattr(self, 'just'):
            show += 'Just: %s\n' % str(self.just)
        return show

    def __repr__(self):
        """
        Object representation of corpus
        """
        import os
        if not self.subcorpora:
            ssubcorpora = ''
        else:
            ssubcorpora = self.subcorpora
        return "<%s instance: %s; %d subcorpora>" % (
            classname(self), os.path.basename(self.path), len(ssubcorpora))

    def __getitem__(self, key):
        """
        Get attributes from corpus

        todo: symbolic stuff for item selection
        """
        from corpkit.constants import STRINGTYPE
        from corpkit.process import makesafe
        if getattr(self, 'subcorpora', False):
            get_from = self.subcorpora
        elif getattr(self, 'files', False):
            get_from = self.files
        else:
            get_from = self.document
        try:
            return get_from.loc[key]
        except Exception:
            return get_from.__getitem__(key)

    def __delitem__(self, key):
        from corpkit.constants import STRINGTYPE
        from corpkit.process import makesafe
        if getattr(self, 'subcorpora', False):
            del_from = self.subcorpora
        elif getattr(self, 'files', False):
            del_from = self.files
        if isinstance(key, (int, slice)):
            del_from.__delitem__(key)
        elif isinstance(key, STRINGTYPE):
            del_from.__delitem__(del_from.index(key))

    @lazyprop
    def features(self):
        """
        Generate and show basic stats from the corpus, including number of
        sentences, clauses, process types, etc.
        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212
            02       25844    7933   6920                4313              3620     2270
            03       18376    5683   4877                3067              2616     1640
            04       20066    6354   5366                3587              2767     1775
        """
        from corpkit.dictionaries.word_transforms import mergetags
        from corpkit.process import (get_corpus_metadata,
                                     add_df_to_dotfile, make_df_json_name)
        kwa = {'just_metadata': self.just,
               'skip_metadata': self.skip,
               'subcorpora': self.symbolic}
        md = get_corpus_metadata(self.path, generate=True)
        name = make_df_json_name('features', self.symbolic)
        if name in md:
            import pandas as pd
            try:
                return pd.DataFrame(md[name])
            except ValueError:
                return pd.Series(md[name])
        else:
            feat = self.interrogate('features', **kwa)
            from corpkit.interrogation import Interrodict
            if isinstance(feat, Interrodict):
                feat = feat.multiindex()
            feat = feat.results
            add_df_to_dotfile(self.path, feat, typ='features',
                              subcorpora=self.symbolic)
            return feat

    def _get_postags_and_wordclasses(self):
        """
        Called internally by corpus.postags and corpus.wordclasses
        """
        from corpkit.dictionaries.word_transforms import mergetags
        from corpkit.process import (get_corpus_metadata,
                                     add_df_to_dotfile, make_df_json_name)
        kwa = {'just_metadata': self.just,
               'skip_metadata': self.skip,
               'subcorpora': self.symbolic}
        md = get_corpus_metadata(self.path, generate=True)
        pname = make_df_json_name('postags', self.symbolic)
        wname = make_df_json_name('wordclasses', self.symbolic)
        if pname in md and wname in md:
            import pandas as pd
            try:
                return pd.DataFrame(md[pname]), pd.DataFrame(md[wname])
            except ValueError:
                return pd.Series(md[pname]), pd.Series(md[wname])
        else:
            postags = self.interrogate('postags', **kwa)
            from corpkit.interrogation import Interrodict
            if isinstance(postags, Interrodict):
                postags = postags.multiindex()
            wordclasses = postags.edit(merge_entries=mergetags,
                                       sort_by='total').results.astype(int)
            postags = postags.results
            add_df_to_dotfile(self.path, postags, typ='postags',
                              subcorpora=self.symbolic)
            add_df_to_dotfile(self.path, wordclasses, typ='wordclasses',
                              subcorpora=self.symbolic)
            return postags, wordclasses

    @lazyprop
    def wordclasses(self):
        """
        Generate and show word class frequencies for the corpus

        :Example:

        >>> corpus.wordclasses
            SB   Verb  Noun  Preposition  Determiner  ...
            01  26873  8513         7308        5508  ...
            02  25844  7933         6920        3323  ...
            03  18376  5683         4877        3137  ...
            04  20066  6354         5366        4336  ...
        """
        postags, wordclasses = self._get_postags_and_wordclasses()
        return wordclasses

    @lazyprop
    def postags(self):
        """
        Generate and show POS tag frequencies for the corpus

        :Example:

        >>> corpus.postags
            SB     NN    VB    JJ    IN    DT
            01  26873  8513  7308  4809  3704  ...
            02  25844  7933  6920  4313  3620  ...
            03  18376  5683  4877  3067  2616  ...
            04  20066  6354  5366  3587  2767  ...
        """
        postags, wordclasses = self._get_postags_and_wordclasses()
        return postags

    @lazyprop
    def lexicon(self, **kwargs):
        """
        Get a lexicon/frequency distribution from a corpus, and save it to
        disk for next time.
        :returns: a `DataFrame` of tokens and counts
        """
        from corpkit.process import (get_corpus_metadata,
                                     add_df_to_dotfile, make_df_json_name)
        kwa = {'just_metadata': self.just,
               'skip_metadata': self.skip,
               'subcorpora': self.symbolic}
        md = get_corpus_metadata(self.path, generate=True)
        name = make_df_json_name('lexicon', self.symbolic)
        if name in md:
            import pandas as pd
            return pd.DataFrame(md[name])
        else:
            lexi = self.interrogate('lexicon', **kwa)
            from corpkit.interrogation import Interrodict
            if isinstance(lexi, Interrodict):
                lexi = lexi.multiindex()
            lexi = lexi.results
            add_df_to_dotfile(self.path, lexi, typ='lexicon',
                              subcorpora=self.symbolic)
            return lexi
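    # --- Usage sketch (illustrative; not part of the original module) --------
    # features, postags, wordclasses and lexicon are lazy properties: the
    # first access interrogates the corpus and caches the result via the
    # corpus dotfile; later accesses reload the stored data.
    #
    # >>> corpus.features['Tokens']     # token counts per subcorpus
    # >>> corpus.postags['NN']          # common-noun counts per subcorpus
    # >>> corpus.lexicon.iloc[:, :10]   # ten most frequent tokens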
    def configurations(self, search, **kwargs):
        """
        Get the overall behaviour of tokens or lemmas matching a regular
        expression. The search below makes DataFrames containing the most
        common subjects, objects, modifiers (etc.) of 'see':

        :param search: Similar to `search` in the
           :func:`~corpkit.corpus.Corpus.interrogate` method. Valid keys are:

           - `W`/`L`: match word or lemma
           - `F`: match a semantic role (`'participant'`, `'process'` or
             `'modifier'`). If `F` is not specified, each role will be
             searched for.

        :type search: `dict`

        :Example:

        >>> see = corpus.configurations({L: 'see', F: 'process'}, show=L)
        >>> see.has_subject.results.sum()
            i     452
            it    227
            you   162
            we    111
            he     94

        :returns: :class:`corpkit.interrogation.Interrodict`
        """
        if 'subcorpora' not in kwargs:
            kwargs['subcorpora'] = self.symbolic
        if 'just_metadata' not in kwargs:
            kwargs['just_metadata'] = self.just
        if 'skip_metadata' not in kwargs:
            kwargs['skip_metadata'] = self.skip
        from corpkit.configurations import configurations
        return configurations(self, search, **kwargs)
    def interrogate(self, search='w', *args, **kwargs):
        """
        Interrogate a corpus of texts for a lexicogrammatical phenomenon.

        This method iterates over the files/folders in a corpus, searching
        the texts, and returning a
        :class:`corpkit.interrogation.Interrogation` object containing the
        results. The main options are `search`, where you specify search
        criteria, and `show`, where you specify what you want to appear in
        the output.

        :Example:

        >>> corpus = Corpus('data/conversations-parsed')
        ### show lemma form of nouns ending in 'ing'
        >>> q = {W: r'ing$', P: r'^N'}
        >>> data = corpus.interrogate(q, show=L)
        >>> data.results
            ..  something  anything  thing  feeling  everything  nothing  morning
            01         14        11     12        1           6        0        1
            02         10        20      4        4           8        3        0
            03         14         5      5        3           1        0        0
            ...                                                              ...

        :param search: What part of the lexicogrammar to search, and what
           criteria to match. The `keys` are the thing to be searched, and
           values are the criteria. To search parse trees, use the `T` key,
           and a Tregex query as the value. When searching dependencies, you
           can use any of:

           +--------------------+-------+----------+-----------+------+
           |                    | Match | Governor | Dependent | Head |
           +====================+=======+==========+===========+======+
           | Word               | `W`   | `G`      | `D`       | `H`  |
           +--------------------+-------+----------+-----------+------+
           | Lemma              | `L`   | `GL`     | `DL`      | `HL` |
           +--------------------+-------+----------+-----------+------+
           | Function           | `F`   | `GF`     | `DF`      | `HF` |
           +--------------------+-------+----------+-----------+------+
           | POS tag            | `P`   | `GP`     | `DP`      | `HP` |
           +--------------------+-------+----------+-----------+------+
           | Word class         | `X`   | `GX`     | `DX`      | `HX` |
           +--------------------+-------+----------+-----------+------+
           | Distance from root | `A`   | `GA`     | `DA`      | `HA` |
           +--------------------+-------+----------+-----------+------+
           | Index              | `I`   | `GI`     | `DI`      | `HI` |
           +--------------------+-------+----------+-----------+------+
           | Sentence index     | `S`   | `SI`     | `SI`      | `SI` |
           +--------------------+-------+----------+-----------+------+

           Values should be regular expressions or wordlists to match.

        :type search: `dict`

        :Example:

        >>> corpus.interrogate({T: r'/NN.?/ < /^t/'}) # 't' nouns, via trees
        >>> corpus.interrogate({W: r'^t', P: r'^v'})  # 't' verbs, via dependencies

        :param searchmode: Return results matching any/all criteria
        :type searchmode: `str` -- `'any'`/`'all'`

        :param exclude: The inverse of `search`, removing results from search
        :type exclude: `dict` -- `{L: 'be'}`

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: `str` -- `'any'`/`'all'`

        :param query: A search query for the interrogation. This is only used
           when `search` is a `str`, or when multiprocessing. If `search` is
           a `str`, the search criteria can be passed in as `query`, in order
           to allow the simpler syntax:

           >>> corpus.interrogate(GL, '(think|want|feel)')

           When multiprocessing, the following is possible:

           >>> q = {'Nouns': r'/NN.?/', 'Verbs': r'/VB.?/'}
           ### return an Interrogation object with multiindex:
           >>> corpus.interrogate(T, q)
           ### return an Interrogation object without multiindex:
           >>> corpus.interrogate(T, q, show=C)

        :type query: `str`, `dict` or `list`

        :param show: What to output. If multiple strings are passed in as a
           `list`, results will be colon-separated, in the supplied order.
           Possible values are the same as those for `search`, plus options
           for n-gramming and getting collocates:

           +------+----------------------+------------------------+
           | Show | Gloss                | Example                |
           +======+======================+========================+
           | N    | N-gram word          | `The women were`       |
           +------+----------------------+------------------------+
           | NL   | N-gram lemma         | `The woman be`         |
           +------+----------------------+------------------------+
           | NF   | N-gram function      | `det nsubj root`       |
           +------+----------------------+------------------------+
           | NP   | N-gram POS tag       | `DT NNS VBN`           |
           +------+----------------------+------------------------+
           | NX   | N-gram word class    | `determiner noun verb` |
           +------+----------------------+------------------------+
           | B    | Collocate word       | `The_were`             |
           +------+----------------------+------------------------+
           | BL   | Collocate lemma      | `The_be`               |
           +------+----------------------+------------------------+
           | BF   | Collocate function   | `det_root`             |
           +------+----------------------+------------------------+
           | BP   | Collocate POS tag    | `DT_VBN`               |
           +------+----------------------+------------------------+
           | BX   | Collocate word class | `determiner_verb`      |
           +------+----------------------+------------------------+

        :type show: `str`/`list` of strings

        :param lemmatise: Force lemmatisation on results. **Deprecated:
           instead, output a lemma form with the `show` argument**
        :type lemmatise: `bool`

        :param lemmatag: When using a Tregex/Tgrep query, the tool will
           attempt to determine the word class of results from the query.
           Passing in a `str` here tells the lemmatiser the expected POS of
           results to lemmatise. It only has an effect if trees are being
           searched and lemmata are being shown.
        :type lemmatag: `'n'`/`'v'`/`'a'`/`'r'`/`False`

        :param save: Save result as pickle to `saved_interrogations/<save>`
           on completion
        :type save: `str`

        :param gramsize: Size of n-grams (default 1, i.e. unigrams)
        :type gramsize: `int`

        :param multiprocess: How many parallel processes to run
        :type multiprocess: `int`/`bool` (`bool` determines automatically)

        :param files_as_subcorpora: (**Deprecated: use subcorpora='files'**).
           Treat each file as a subcorpus, ignoring actual subcorpora if
           present
        :type files_as_subcorpora: `bool`

        :param conc: Generate a concordance while interrogating, and store it
           as the `.concordance` attribute
        :type conc: `bool`/`'only'`

        :param coref: Also get coreferents for search matches
        :type coref: `bool`

        :param tgrep: Use `TGrep` for tree querying. TGrep is less expressive
           than Tregex, and is slower, but can work without Java. This option
           may be turned on internally if Java is not found.
        :type tgrep: `bool`

        :param subcorpora: Use a metadata value as subcorpora. Passing a list
           will create a multiindex. `'file'` and `'folder'`/`'default'` are
           also possible values.
        :type subcorpora: `str`/`list`

        :param just_metadata: One or more metadata fields and criteria to
           filter sentences by. Only those matching will be kept. Criteria
           can be a list of words or a regular expression. Passing
           ``{'speaker': 'ENVER'}`` will search only sentences annotated with
           ``speaker=ENVER``.
        :type just_metadata: `dict`

        :param skip_metadata: A field and regex/list to filter sentences by.
           The inverse of ``just_metadata``.
        :type skip_metadata: `dict`

        :param discard: When returning many (i.e. millions of) results,
           memory can be a problem. Setting a discard value will ignore
           results occurring infrequently in a subcorpus. An ``int`` will
           remove any result occurring ``n`` times or fewer. A ``float`` will
           remove that proportion of results (i.e.
           0.1 will remove 10 per cent)
        :type discard: ``int``/``float``

        :returns: A :class:`corpkit.interrogation.Interrogation` object, with
           `.query`, `.results` and `.totals` attributes. If multiprocessing
           is invoked, the result may be multiindexed.
        """
        from corpkit.interrogator import interrogator
        import pandas as pd
        par = kwargs.pop('multiprocess', None)
        kwargs.pop('corpus', None)

        if self.datatype != 'conll':
            raise ValueError('You need to parse or tokenise the corpus '
                             'before searching.')

        # handle symbolic structures
        subcorpora = kwargs.get('subcorpora', False)
        if self.level == 's':
            subcorpora = 'file'
        if self.symbolic:
            subcorpora = self.symbolic
        if 'subcorpora' in kwargs:
            subcorpora = kwargs.pop('subcorpora')
        if subcorpora in ['default', 'folder', 'folders']:
            subcorpora = False
        if subcorpora in ['file', 'files']:
            subcorpora = False
            kwargs['files_as_subcorpora'] = True

        if self.skip:
            if kwargs.get('skip_metadata'):
                kwargs['skip_metadata'].update(self.skip)
            else:
                kwargs['skip_metadata'] = self.skip
        if self.just:
            if kwargs.get('just_metadata'):
                kwargs['just_metadata'].update(self.just)
            else:
                kwargs['just_metadata'] = self.just

        kwargs.pop('subcorpora', False)

        if par and self.subcorpora:
            if isinstance(par, int):
                kwargs['multiprocess'] = par
            res = interrogator(self.subcorpora, search,
                               subcorpora=subcorpora, *args, **kwargs)
        else:
            kwargs['multiprocess'] = par
            res = interrogator(self, search,
                               subcorpora=subcorpora, *args, **kwargs)

        if kwargs.get('conc', False) == 'only':
            return res

        from corpkit.interrogation import Interrodict
        if isinstance(res, Interrodict) and kwargs.get('use_interrodict'):
            return res
        elif isinstance(res, Interrodict) and not kwargs.get('use_interrodict', False):
            return res.multiindex()
        else:
            if subcorpora:
                res.results.index.name = subcorpora
            # sort by total
            ind = list(res.results.index)
            if isinstance(res.results, pd.DataFrame):
                if not res.results.empty:
                    res.results = res.results[list(
                        res.results.sum().sort_values(ascending=False).index)]
                    res.results = res.results.astype(int)
                    if all(i == 'none' or str(i).isdigit() for i in ind):
                        longest = max([len(str(i)) if str(i).isdigit() else 1
                                       for i in ind])
                        res.results.index = [str(i).zfill(longest) for i in ind]
                        res.results = res.results.sort_index().astype(int)
            else:
                show = res.query.get('show', [])
                outs = []
                from corpkit.constants import transshow, transobjs
                for bit in show:
                    name = transobjs.get(bit[0], bit[0]) + '-' + \
                        transshow.get(bit[-1], bit[-1])
                    name = name.replace('Match-', '').lower()
                    outs.append(name)
                name = '/'.join(outs)
                if name:
                    res.results.name = name
            return res
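    # --- Usage sketch (illustrative; not part of the original module) --------
    # A typical interrogation, filtering by metadata and using a metadata
    # value as subcorpora. The 'year' field and corpus path are hypothetical.
    #
    # >>> corpus = Corpus('data/conversations-parsed', print_info=False)
    # >>> res = corpus.interrogate({'w': r'^transport'},
    # ...                          show=['l'],
    # ...                          just_metadata={'speaker': 'ENVER'},
    # ...                          subcorpora='year')
    # >>> res.results   # DataFrame: one row per year, one column per lemma
    # >>> res.totals    # Series: total matches per year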
    def sample(self, n, level='f'):
        """
        Get a sample of the corpus

        :param n: amount of data in the sample. If an ``int``, get n files.
           If a ``float``, get float * 100 as a percentage of the corpus
        :type n: ``int``/``float``

        :param level: sample subcorpora (``s``) or files (``f``)
        :type level: ``str``

        :returns: a Corpus object
        """
        import random
        if isinstance(n, int):
            if level.lower().startswith('s'):
                rs = random.sample(list(self.subcorpora), n)
                rs = sorted(rs, key=lambda x: x.name)
                return Corpus(Datalist(rs), print_info=False, datatype='conll')
            else:
                fps = list(self.all_files)
                dl = Datalist(random.sample(fps, n))
                return Corpus(dl, level='d', print_info=False, datatype='conll')
        elif isinstance(n, float):
            if level.lower().startswith('s'):
                fps = list(self.subcorpora)
                n = int(len(fps) * n)
                return Corpus(Datalist(random.sample(fps, n)),
                              print_info=False, datatype='conll')
            else:
                fps = list(self.all_files)
                n = int(len(fps) * n)
                return Corpus(Datalist(random.sample(fps, n)), level='d',
                              print_info=False, datatype='conll')
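    # --- Usage sketch (illustrative; not part of the original module) --------
    # Sampling is handy for trying a slow query before running it everywhere.
    #
    # >>> pilot = corpus.sample(0.1)           # roughly 10 per cent of files
    # >>> pilot = corpus.sample(5)             # exactly five files
    # >>> pilot = corpus.sample(2, level='s')  # two whole subcorpora
    # >>> pilot.interrogate({'w': 'any'})      # then interrogate as usual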
    def delete_metadata(self):
        """
        Delete metadata for the corpus. May be needed if the corpus has
        changed
        """
        import os
        os.remove(os.path.join('data', '.%s.json' % self.name))
    @lazyprop
    def metadata(self):
        """
        Get metadata for a corpus
        """
        from corpkit.process import get_corpus_metadata
        return get_corpus_metadata(self, generate=True)
    def parse(self,
              corenlppath=False,
              operations=False,
              copula_head=True,
              speaker_segmentation=False,
              memory_mb=False,
              multiprocess=False,
              split_texts=400,
              outname=False,
              metadata=False,
              coref=True,
              *args,
              **kwargs):
        """
        Parse an unparsed corpus, saving to disk

        :param corenlppath: Folder containing CoreNLP jar files (use if
           *corpkit* can't find it automatically)
        :type corenlppath: `str`

        :param operations: Which kinds of annotations to do
        :type operations: `str`

        :param speaker_segmentation: Add speaker name to parser output if
           your corpus is script-like
        :type speaker_segmentation: `bool`

        :param memory_mb: Amount of memory in MB for the parser
        :type memory_mb: `int`

        :param copula_head: Make copula head in dependency parse
        :type copula_head: `bool`

        :param split_texts: Split texts longer than `n` lines for parser
           memory
        :type split_texts: `int`

        :param multiprocess: Split parsing across n cores (for
           high-performance computers)
        :type multiprocess: `int`

        :param folderise: If corpus is just files, move each into its own
           folder
        :type folderise: `bool`

        :param output_format: Save parser output as `xml`, `json` or `conll`
        :type output_format: `str`

        :param outname: Specify a name for the parsed corpus
        :type outname: `str`

        :param metadata: Use if you have XML tags at the end of lines
           containing metadata
        :type metadata: `bool`

        :param coref: Do coreference resolution, so that coreferents can be
           searched for later
        :type coref: `bool`

        :Example:

        >>> parsed = corpus.parse(speaker_segmentation=True)
        >>> parsed
        <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        import os
        if outname:
            outpath = os.path.join('data', outname)
            if os.path.exists(outpath):
                raise ValueError('Path exists: %s' % outpath)
        from corpkit.make import make_corpus
        #from corpkit.process import determine_datatype
        #dtype, singlefile = determine_datatype(self.path)
        if self.datatype != 'plaintext':
            raise ValueError(
                'parse method can only be used on plaintext corpora.')
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        kwargs['output_format'] = kwargs.pop('output_format', 'conll')
        corp = make_corpus(unparsed_corpus_path=self.path,
                           parse=True,
                           tokenise=False,
                           corenlppath=corenlppath,
                           operations=operations,
                           copula_head=copula_head,
                           speaker_segmentation=speaker_segmentation,
                           memory_mb=memory_mb,
                           multiprocess=multiprocess,
                           split_texts=split_texts,
                           outname=outname,
                           metadata=metadata,
                           coref=coref,
                           *args,
                           **kwargs)
        if not corp:
            return
        if os.path.isfile(corp):
            return File(corp)
        else:
            return Corpus(corp)
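    # --- Usage sketch (illustrative; not part of the original module) --------
    # The usual workflow: build a Corpus from plaintext, parse it once, then
    # work with the parsed version. Paths are hypothetical, and parsing
    # requires a working CoreNLP installation.
    #
    # >>> unparsed = Corpus('data/speeches')
    # >>> parsed = unparsed.parse(multiprocess=2)
    # >>> parsed.interrogate({'f': 'nsubj'}, show=['l'])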
    def tokenise(self, postag=True, lemmatise=True, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

        :param nltk_data_path: Path to tokeniser if not found automatically
        :type nltk_data_path: `str`

        :Example:

        >>> tok = corpus.tokenise()
        >>> tok
        <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from corpkit.make import make_corpus
        #from corpkit.process import determine_datatype
        #dtype, singlefile = determine_datatype(self.path)
        if self.datatype != 'plaintext':
            raise ValueError(
                'tokenise method can only be used on plaintext corpora.')
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        c = make_corpus(self.path,
                        parse=False,
                        tokenise=True,
                        postag=postag,
                        lemmatise=lemmatise,
                        *args,
                        **kwargs)
        return Corpus(c)
    def concordance(self, *args, **kwargs):
        """
        A concordance method for Tregex queries, CoreNLP dependencies,
        tokenised data or plaintext.

        :Example:

        >>> wv = ['want', 'need', 'feel', 'desire']
        >>> corpus.concordance({L: wv, F: 'root'})
            0  01  1-01.txt.conll                But , so I   feel  like i do that for w
            1  01  1-01.txt.conll                         I   felt  a little like oh , i
            2  01  1-01.txt.conll  he 's a difficult man I   feel  like his work ethic
            3  01  1-01.txt.conll                    So I   felt  like i recognized li
            ...                                                                     ...

        Arguments are the same as :func:`~corpkit.corpus.Corpus.interrogate`,
        plus a few extra parameters:

        :param only_format_match: If `True`, left and right window will just
           be words, regardless of what is in `show`
        :type only_format_match: `bool`

        :param only_unique: Return only unique lines
        :type only_unique: `bool`

        :param maxconc: Maximum number of concordance lines
        :type maxconc: `int`

        :returns: A :class:`corpkit.interrogation.Concordance` instance, with
           columns showing filename, subcorpus name, speaker name, left
           context, match and right context.
        """
        kwargs.pop('conc', None)
        kwargs.pop('corpus', None)
        return self.interrogate(conc='only', *args, **kwargs)
    def interroplot(self, search, **kwargs):
        """
        Interrogate, relativise, then plot, with very little customisability.
        A demo function.

        :Example:

        >>> corpus.interroplot(r'/NN.?/ >># NP')
        <matplotlib figure>

        :param search: Search as per
           :func:`~corpkit.corpus.Corpus.interrogate`
        :type search: `dict`

        :param kwargs: Extra arguments to pass to
           :func:`~corpkit.corpus.Corpus.visualise`
        :type kwargs: `keyword arguments`

        :returns: `None` (but shows a plot)
        """
        if isinstance(search, STRINGTYPE):
            search = {'t': search}
        interro = self.interrogate(search=search, show=kwargs.pop('show', 'w'))
        edited = interro.edit('%', 'self', print_info=False)
        edited.visualise(self.name, **kwargs).show()
    def save(self, savename=False, **kwargs):
        """
        Save corpus instance to file. There's not much reason to do this,
        really.

        >>> corpus.save(filename)

        :param savename: Name for the file
        :type savename: `str`

        :returns: `None`
        """
        from corpkit.other import save
        if not savename:
            savename = self.name
        save(self, savename, savedir=kwargs.pop('savedir', 'data'), **kwargs)
    def make_language_model(self,
                            name,
                            search={'w': 'any'},
                            exclude=False,
                            show=['w', '+1mw'],
                            **kwargs):
        """
        Make a language model for the corpus

        :param name: a name for the model
        :type name: `str`

        :param kwargs: keyword arguments for the interrogate() method
        :type kwargs: `keyword arguments`

        :returns: a :class:`corpkit.model.MultiModel`
        """
        import os
        from corpkit.other import load
        from corpkit.model import MultiModel
        if not name.endswith('.p'):
            namep = name + '.p'
        else:
            namep = name

        # handle symbolic structures
        subcorpora = False
        if self.symbolic:
            subcorpora = self.symbolic
        if kwargs.get('subcorpora', False):
            subcorpora = kwargs.pop('subcorpora')
        kwargs.pop('subcorpora', False)
        jst = kwargs.pop('just_metadata') if 'just_metadata' in kwargs else self.just
        skp = kwargs.pop('skip_metadata') if 'skip_metadata' in kwargs else self.skip

        pth = os.path.join('models', namep)
        if os.path.isfile(pth):
            print('Returning saved model: %s' % pth)
            return load(name, loaddir='models')

        # set some defaults if not passed in as kwargs
        #langmod = not any(i.startswith('n') for i in search.keys())
        res = self.interrogate(search, exclude, show,
                               subcorpora=subcorpora,
                               just_metadata=jst,
                               skip_metadata=skp,
                               **kwargs)
        return res.language_model(name, search=search, **kwargs)
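    # --- Usage sketch (illustrative; not part of the original module) --------
    # A model is built once and cached under models/<name>.p; subsequent
    # calls with the same name load it from disk. The name is hypothetical.
    #
    # >>> lm = corpus.make_language_model('speeches-lm')
    # >>> lm    # a corpkit.model.MultiModel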
    def annotate(self, conclines, annotation, dry_run=True):
        """
        Annotate a corpus

        :param conclines: a Concordance or DataFrame containing matches to
           annotate
        :type conclines: Concordance/DataFrame

        :param annotation: a tag, or a field and value
        :type annotation: ``str``/``dict``

        :param dry_run: Show the annotations to be made, but don't do them
        :type dry_run: ``bool``

        :returns: ``None``
        """
        from corpkit.interrogation import Interrogation
        if isinstance(conclines, Interrogation):
            conclines = getattr(conclines, 'concordance', conclines)
        from corpkit.annotate import annotator
        annotator(conclines, annotation, dry_run=dry_run)
        # regenerate metadata afterward---could be a bit slow?
        if not dry_run:
            self.delete_metadata()
            from corpkit.process import make_dotfile
            make_dotfile(self)
    def unannotate(self, annotation, dry_run=True):
        """
        Delete annotation from a corpus

        :param annotation: a tag, or a field and value
        :type annotation: ``str``/``dict``

        :returns: ``None``
        """
        from corpkit.annotate import annotator
        annotator(self, annotation, dry_run=dry_run, deletemode=True)
class Subcorpus(Corpus):
    """
    Model a subcorpus, containing files but no subdirectories.

    Methods for interrogating, concordancing and configurations are the same
    as for :class:`corpkit.corpus.Corpus`.
    """

    def __init__(self, path, datatype, **kwa):
        self.path = path
        kwargs = {'print_info': False, 'level': 's', 'datatype': datatype}
        kwargs.update(kwa)
        self.kwargs = kwargs
        Corpus.__init__(self, self.path, **kwargs)

    def __str__(self):
        return self.path

    def __repr__(self):
        return "<%s instance: %s>" % (classname(self), self.name)

    def __getitem__(self, key):
        from corpkit.process import makesafe
        if isinstance(key, slice):
            # get the start, stop and step from the slice
            key = list(key.indices(len(self.files)))
            return Datalist(list(self.files)[slice(*key)])
            #bits = [self[i] for i in range(*key.indices(len(self.files)))]
            #return [self[ii] for ii in range(*key.indices(len(self.files)))]
        elif isinstance(key, int):
            return list(self.files)[key]
        else:
            try:
                return self.files.__getattribute__(key)
            except Exception:
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)
class File(Corpus):
    """
    Models a corpus file for reading, interrogating, concordancing.

    Methods for interrogating, concordancing and configurations are the same
    as for :class:`corpkit.corpus.Corpus`, plus methods for accessing the
    file contents directly as a `str`, or as a pandas DataFrame.
    """

    def __init__(self, path, dirname=False, datatype=False, **kwa):
        import os
        from os.path import join, isfile, isdir
        if dirname:
            self.path = join(dirname, path)
        else:
            self.path = path
        kwargs = {'print_info': False, 'level': 'f', 'datatype': datatype}
        kwargs.update(kwa)
        Corpus.__init__(self, self.path, **kwargs)
        if self.path.endswith('.conll') or self.path.endswith('.conllu'):
            self.datatype = 'conll'
        else:
            self.datatype = 'plaintext'

    def __repr__(self):
        return "<%s instance: %s>" % (classname(self), self.name)

    def __str__(self):
        return self.path
    def read(self, **kwargs):
        """
        Read file data. If data is pickled, unpickle first

        :returns: `str`/unpickled data
        """
        from corpkit.constants import OPENER
        with OPENER(self.path, 'r', **kwargs) as fo:
            return fo.read()
    @lazyprop
    def document(self):
        """
        Return a version of the file that can be manipulated

        * For conll, this is a DataFrame
        * For tokens, this is a list of tokens
        * For plaintext, this is a string
        """
        if self.datatype == 'conll':
            from corpkit.conll import parse_conll
            return parse_conll(self.path)
        else:
            from corpkit.process import saferead
            return saferead(self.path)[0]

    @lazyprop
    def trees(self):
        """
        Get an OrderedDict of Tree objects in a File
        """
        if self.datatype == 'conll':
            from nltk import Tree
            from collections import OrderedDict
            return OrderedDict({k: Tree.fromstring(v['parse'])
                                for k, v in sorted(self.document._metadata.items())})
        else:
            raise AttributeError('Data must be parsed to get trees.')

    @lazyprop
    def plain(self):
        """
        Show the sentences in a File as plaintext
        """
        text = []
        if self.datatype == 'conll':
            doc = self.document
            for sent in list(doc.index.levels[0]):
                text.append('%d: ' % sent + ' '.join(list(doc.loc[sent]['w'])))
        else:
            text.append(self.read())
        return '\n'.join(text)
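# --- Usage sketch (illustrative; not part of the original module) ------------
# Three views of the same parsed file; the path is hypothetical.
#
# >>> f = File('data/example-parsed/01/chapter1.txt.conll')
# >>> f.document      # pandas DataFrame, one row per token
# >>> f.trees         # OrderedDict of nltk.Tree objects, keyed by sentence
# >>> print(f.plain)  # numbered plaintext sentences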
class Datalist(list):
    """
    A list of corpus files or subcorpora, selectable by name, index, slice
    or list of names/indices
    """

    def __init__(self, data, **kwargs):
        self.symbolic = kwargs.get('symbolic', False)
        self.just = kwargs.get('just', False)
        self.skip = kwargs.get('skip', False)
        super(Datalist, self).__init__(data)

    def __repr__(self):
        return "<%s instance: %d items>" % (classname(self), len(self))

    def __getattr__(self, key):
        ix = next((i for i, d in enumerate(self) if d.name == key), None)
        if ix is not None:
            return self[ix]

    def __getitem__(self, key):
        from corpkit.constants import STRINGTYPE
        if isinstance(key, slice):
            return Datalist([self[i] for i in range(*key.indices(len(self)))])
        elif isinstance(key, list):
            if isinstance(key[0], STRINGTYPE):
                dats = [i for i in self if i.name in key]
            else:
                dats = [x for i, x in enumerate(self) if i in key]
            return Datalist(dats)
        elif isinstance(key, int):
            return super(Datalist, self).__getitem__(key)
        elif isinstance(key, STRINGTYPE):
            ix = next((i for i, x in enumerate(self) if x.name == key), None)
            if ix is not None:
                return super(Datalist, self).__getitem__(ix)

    def __delitem__(self, key):
        from corpkit.constants import STRINGTYPE
        if isinstance(key, STRINGTYPE):
            key = next((i for i, d in enumerate(self) if d.name == key), None)
            if key is None:
                return
        super(Datalist, self).__delitem__(key)
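    # --- Usage sketch (illustrative; not part of the original module) --------
    # Datalist supports selection by position, slice, name, or list of names;
    # the subcorpus names here are hypothetical.
    #
    # >>> corpus.subcorpora[0]                  # first subcorpus
    # >>> corpus.subcorpora[:2]                 # Datalist of the first two
    # >>> corpus.subcorpora['1987']             # by name
    # >>> corpus.subcorpora[['1987', '1988']]   # Datalist of named items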
    def interrogate(self, *args, **kwargs):
        """
        Interrogate the corpus using
        :func:`~corpkit.corpus.Corpus.interrogate`
        """
        kwargs['just'] = self.just
        kwargs['skip'] = self.skip
        kwargs['subcorpora'] = self.symbolic
        from corpkit.interrogator import interrogator
        interro = interrogator(self, *args, **kwargs)
        from corpkit.interrogation import Interrodict
        if isinstance(interro, Interrodict):
            interro = interro.multiindex(indexnames=['corpus', 'subcorpus'])
        return interro
    def concordance(self, *args, **kwargs):
        """
        Concordance the corpus using
        :func:`~corpkit.corpus.Corpus.concordance`
        """
        kwargs['just'] = self.just
        kwargs['skip'] = self.skip
        kwargs['subcorpora'] = self.symbolic
        from corpkit.interrogator import interrogator
        return interrogator(self, conc='only', *args, **kwargs)
    def configurations(self, search, **kwargs):
        """
        Get a configuration using
        :func:`~corpkit.corpus.Corpus.configurations`
        """
        kwargs['just'] = self.just
        kwargs['skip'] = self.skip
        kwargs['subcorpora'] = self.symbolic
        from corpkit.configurations import configurations
        return configurations(self, search, **kwargs)
class Corpora(Datalist):
    """
    Models a collection of Corpus objects. Methods are available for
    interrogating and plotting the entire collection. This is the highest
    level of abstraction available.

    :param data: Corpora to model. A `str` is interpreted as a path
       containing corpora. A `list` can be a list of corpus paths or
       :class:`corpkit.corpus.Corpus` objects.
    :type data: `str`/`list`
    """

    def __init__(self, data=False, **kwargs):
        import os
        from os.path import join, isfile, isdir
        self.name = None

        # if no arg, load every corpus in the data dir
        if not data:
            data = 'data'

        # handle a folder containing corpora
        if isinstance(data, STRINGTYPE):
            if not os.path.isdir(data):
                if not os.path.isdir(os.path.join('data', data)):
                    raise ValueError('Corpora(str) needs to point to a directory.')
                else:
                    data = os.path.join('data', data)
            self.name = os.path.basename(data)
            data = sorted([join(data, d) for d in os.listdir(data)
                           if isdir(join(data, d)) and not d.startswith('.')])

        # otherwise, make a list of Corpus objects
        if not self.name:
            self.name = ','.join([os.path.basename(str(i)) for i in data])
        for index, i in enumerate(data):
            if isinstance(i, STRINGTYPE):
                data[index] = Corpus(i, **kwargs)

        # now turn it into a Datalist
        Datalist.__init__(self, data, **kwargs)

    def __repr__(self):
        return "<%s instance: %d items>" % (classname(self), len(self))
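    # --- Usage sketch (illustrative; not part of the original module) --------
    # Corpora can be built from a directory of corpora, or from a list of
    # paths/objects; interrogate() is inherited from Datalist. The directory
    # name and query are hypothetical.
    #
    # >>> all_corpora = Corpora('data')
    # >>> all_corpora[0]      # a Corpus object
    # >>> all_corpora.interrogate({'w': r'^multicultural'}, show=['l'])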
    def parse(self, **kwargs):
        """
        Parse multiple corpora

        :param kwargs: Arguments to pass to the
           :func:`~corpkit.corpus.Corpus.parse` method.

        :returns: :class:`corpkit.corpus.Corpora`
        """
        from corpkit.corpus import Corpora
        objs = []
        for v in list(self):
            objs.append(v.parse(**kwargs))
        return Corpora(objs)
    ### the below not working yet

    @lazyprop
    def features(self):
        """
        Generate features attribute for all corpora
        """
        from corpkit.interrogation import Interrodict
        feats = []
        for corpus in self:
            feats.append(corpus.features)
        feats = Interrodict(feats)
        return feats.multiindex()

    @lazyprop
    def postags(self):
        """
        Generate postags attribute for all corpora
        """
        for corpus in self:
            corpus.postags

    @lazyprop
    def wordclasses(self):
        """
        Generate wordclasses attribute for all corpora
        """
        for corpus in self:
            corpus.wordclasses

    @lazyprop
    def lexicon(self):
        """
        Generate lexicon attribute for all corpora
        """
        for corpus in self:
            corpus.lexicon