# Source code for corpkit.interrogation

"""
corpkit: Interrogation and Interrogation-like classes
"""

from __future__ import print_function

from collections import OrderedDict
import pandas as pd
from corpkit.process import classname

class Interrogation(object):
    """
    Stores results of a corpus interrogation, before or after editing. The main 
    attribute, :py:attr:`~corpkit.interrogation.Interrogation.results`, is a 
    Pandas object, which can be edited or plotted.
    """

    def __init__(self, results=None, totals=None, query=None, concordance=None):
        """
        Store the components of an interrogation on the instance.

        Every argument is assigned, unmodified, to the attribute of the
        same name.

        :param results: counts for each subcorpus
        :type results: pandas `DataFrame`
        :param totals: summed results
        :type totals: pandas `Series`
        :param query: values that generated the result
        :type query: `dict`
        :param concordance: concordance lines, if they were requested
        :type concordance: pandas `DataFrame`
        """
        self.results = results
        """pandas `DataFrame` containing counts for each subcorpus"""
        self.totals = totals
        """pandas `Series` containing summed results"""
        self.query = query
        """`dict` containing values that generated the result"""
        self.concordance = concordance
        """pandas `DataFrame` containing concordance lines, if concordance lines were requested."""

    def __str__(self):
        if self.query.get('corpus'):
            prst = getattr(self.query['corpus'], 'name', self.query['corpus'])
        else:
            try:
                prst = self.query['interrogation'].query['corpus'].name
            except:
                prst = 'edited'
        st = 'Corpus interrogation: %s\n\n' % (prst)
        return st

    def __repr__(self):
        """
        Return ``<ClassName instance: N total>``.

        ``N`` is ``self.totals.sum()``; when that raises `AttributeError`
        (``totals`` has no ``sum`` method), ``self.totals`` itself is
        formatted instead.
        """
        template = "<%s instance: %d total>"
        try:
            return template % (classname(self), self.totals.sum())
        except AttributeError:
            return template % (classname(self), self.totals)

[docs]    def edit(self, *args, **kwargs):
        """
        Manipulate results of interrogations.

        There are a few overall kinds of edit, most of which can be combined 
        into a single function call. It's useful to keep in mind that many are 
        basic wrappers around `pandas` operations---if you're comfortable with 
        `pandas` syntax, it may be faster at times to use its syntax instead.

        :Basic mathematical operations:

        First, you can do basic maths on results, optionally passing in some 
        data to serve as the denominator. Very commonly, you'll want to get 
        relative frequencies:

        :Example: 

        >>> data = corpus.interrogate({W: r'^t'})
        >>> rel = data.edit('%', SELF)
        >>> rel.results
            ..    to  that   the  then ...   toilet  tolerant  tolerate  ton
            01 18.50 14.65 14.44  6.20 ...     0.00      0.00      0.11 0.00
            02 24.10 14.34 13.73  8.80 ...     0.00      0.00      0.00 0.00
            03 17.31 18.01  9.97  7.62 ...     0.00      0.00      0.00 0.00

        For the operation, there are a number of possible values, each of 
        which is to be passed in as a `str`:

           `+`, `-`, `/`, `*`, `%`: self explanatory

           `k`: calculate keywords

           `a`: get distance metric
        
        `SELF` is a very useful shorthand denominator. When used, all editing 
        is performed on the data. The totals are then extracted from the edited 
        data, and used as denominator. If this is not the desired behaviour, 
        however, a more specific `interrogation.results` or 
        `interrogation.totals` attribute can be used.

        In the example above, `SELF` (or `'self'`) is equivalent to:

        :Example:

        >>> rel = data.edit('%', data.totals)

        :Keeping and skipping data:

        There are four keyword arguments that can be used to keep or skip rows 
        or columns in the data:

        * `just_entries`
        * `just_subcorpora`
        * `skip_entries`
        * `skip_subcorpora`

        Each can accept different input types:

        * `str`: treated as regular expression to match
        * `list`: 

          * of integers: indices to match
          * of strings: entries/subcorpora to match

        :Example:

        >>> data.edit(just_entries=r'^fr', 
        ...           skip_entries=['free','freedom'],
        ...           skip_subcorpora=r'[0-9]')

        :Merging data:

        There are also keyword arguments for merging entries and subcorpora:

        * `merge_entries`
        * `merge_subcorpora`

        These take a `dict`, with the new name as key and the criteria as 
        value. The criteria can be a str (regex) or wordlist.

        :Example:
        
        >>> from dictionaries.wordlists import wordlists
        >>> mer = {'Articles': ['the', 'an', 'a'], 'Modals': wordlists.modals}
        >>> data.edit(merge_entries=mer)

        :Sorting:

        The `sort_by` keyword argument takes a `str`, which represents the way 
        the result columns should be ordered.

        * `increase`: highest to lowest slope value
        * `decrease`: lowest to highest slope value
        * `turbulent`: most change in y axis values
        * `static`: least change in y axis values
        * `total/most`: largest number first
        * `infreq/least`: smallest number first
        * `name`: alphabetically

        :Example:

        >>> data.edit(sort_by='increase')

        :Editing text:
        
        Column labels, corresponding to individual interrogation results, can 
        also be edited with `replace_names`.

        :param replace_names: Edit result names, then merge duplicate entries
        :type replace_names: `str`/`list of tuples`/`dict`

        If `replace_names` is a string, it is treated as a regex to delete from 
        each name. If `replace_names` is a dict, the value is the regex, and 
        the key is the replacement text. Using a list of tuples in the form 
        `(find, replacement)` allows duplicate substitution values.

        :Example:

        >>> data.edit(replace_names={r'object': r'[di]obj'})

        :param replace_subcorpus_names: Edit subcorpus names, then merge duplicates.
                                        The same as `replace_names`, but on the other axis.
        :type replace_subcorpus_names: `str`/`list of tuples`/`dict`

        :Other options:

        There are many other miscellaneous options.

        :param keep_stats: Keep/drop stats values from dataframe after sorting
        :type keep_stats: `bool`
        
        :param keep_top: After sorting, remove all but the top *keep_top* results
        :type keep_top: `int`
        
        :param just_totals: Sum each column and work with sums
        :type just_totals: `bool`
        
        :param threshold: When using results list as dataframe 2, drop values 
                          occurring fewer than n times. If not keywording, you 
                          can use:
                                
           `'high'`: `denominator total / 2500`
           
           `'medium'`: `denominator total / 5000`
           
           `'low'`: `denominator total / 10000`
                            
           If keywording, there are smaller default thresholds

        :type threshold: `int`/`bool`

        :param span_subcorpora: If subcorpora are numerically named, span all 
                                from *int* to *int2*, inclusive
        :type span_subcorpora: `tuple` -- `(int, int2)`

        :param projection: multiply results in subcorpus by n
        :type projection: tuple -- `(subcorpus_name, n)`
        :param remove_above_p: Delete any result over `p`
        :type remove_above_p: `bool`

        :param p: set the p value
        :type p: `float`
        
        :param revert_year: When doing linear regression on years, turn annual 
                            subcorpora into 1, 2 ...
        :type revert_year: `bool`
        
        :param print_info: Print stuff to console showing what's being edited
        :type print_info: `bool`
        
        :param spelling: Convert/normalise spelling:
        :type spelling: `str` -- `'US'`/`'UK'`

        :Keywording options:

        If the operation is `k`, you're calculating keywords. In this case,
        some other keyword arguments have an effect:

        :param keyword_measure: what measure to use to calculate keywords:

           `ll`: log-likelihood
           `pd': percentage difference

        type keyword_measure: `str`
        
        :param selfdrop: When keywording, try to remove target corpus from 
                         reference corpus
        :type selfdrop: `bool`
        
        :param calc_all: When keywording, calculate words that appear in either 
                         corpus
        :type calc_all: `bool`

        :returns: :class:`corpkit.interrogation.Interrogation`
        """
        from corpkit.editor import editor
        return editor(self, *args, **kwargs)

[docs]    def sort(self, way, **kwargs):
        from corpkit.editor import editor
        return editor(self, sort_by=way, **kwargs)

[docs]    def visualise(self,
                  title='',
                  x_label=None,
                  y_label=None,
                  style='ggplot',
                  figsize=(8, 4),
                  save=False,
                  legend_pos='best',
                  reverse_legend='guess',
                  num_to_plot=7,
                  tex='try',
                  colours='Accent',
                  cumulative=False,
                  pie_legend=True,
                  rot=False,
                  partial_pie=False,
                  show_totals=False,
                  transparent=False,
                  output_format='png',
                  interactive=False,
                  black_and_white=False,
                  show_p_val=False,
                  indices=False,
                  transpose=False,
                  **kwargs
                 ):
        """Visualise corpus interrogations using `matplotlib`.

        :Example:

        >>> data.visualise('An example plot', kind='bar', save=True)
        <matplotlib figure>

        :param title: A title for the plot
        :type title: `str`
        :param x_label: A label for the x axis
        :type x_label: `str`
        :param y_label: A label for the y axis
        :type y_label: `str`
        :param kind: The kind of chart to make
        :type kind: `str` (`'line'`/`'bar'`/`'barh'`/`'pie'`/`'area'`/`'heatmap'`)
        :param style: Visual theme of plot
        :type style: `str` ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
        :param figsize: Size of plot
        :type figsize: `tuple` -- `(int, int)`
        :param save: If `bool`, save with *title* as name; if `str`, use `str` as name
        :type save: `bool`/`str`
        :param legend_pos: Where to place legend
        :type legend_pos: `str` ('upper right'/'outside right'/etc)
        :param reverse_legend: Reverse the order of the legend
        :type reverse_legend: `bool`
        :param num_to_plot: How many columns to plot
        :type num_to_plot: `int`/'all'
        :param tex: Use TeX to draw plot text
        :type tex: `bool`
        :param colours: Colourmap for lines/bars/slices
        :type colours: `str`
        :param cumulative: Plot values cumulatively
        :type cumulative: `bool`
        :param pie_legend: Show a legend for pie chart
        :type pie_legend: `bool`
        :param partial_pie: Allow plotting of pie slices only
        :type partial_pie: `bool`
        :param show_totals: Print sums in plot where possible
        :type show_totals: `str` -- 'legend'/'plot'/'both'
        :param transparent: Transparent .png background
        :type transparent: `bool`
        :param output_format: File format for saved image
        :type output_format: `str` -- 'png'/'pdf'
        :param black_and_white: Create black and white line styles
        :type black_and_white: `bool`
        :param show_p_val: Attempt to print p values in legend if contained in df
        :type show_p_val: `bool`
        :param indices: To use when plotting "distance from root"
        :type indices: `bool`
        :param stacked: When making bar chart, stack bars on top of one another
        :type stacked: `str`
        :param filled: For area and bar charts, make every column sum to 100
        :type filled: `str`
        :param legend: Show a legend
        :type legend: `bool`
        :param rot: Rotate x axis ticks by *rot* degrees
        :type rot: `int`
        :param subplots: Plot each column separately
        :type subplots: `bool`
        :param layout: Grid shape to use when *subplots* is True
        :type layout: `tuple` -- `(int, int)`
        :param interactive: Experimental interactive options
        :type interactive: `list` -- `[1, 2, 3]`
        :returns: matplotlib figure
        """
        from corpkit.plotter import plotter
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r'):
            to_plot = self.results
        elif branch.lower().startswith('t'):
            to_plot = self.totals
        return plotter(to_plot,
                       title=title,
                       x_label=x_label,
                       y_label=y_label,
                       style=style,
                       figsize=figsize,
                       save=save,
                       legend_pos=legend_pos,
                       reverse_legend=reverse_legend,
                       num_to_plot=num_to_plot,
                       tex=tex,
                       rot=rot,
                       colours=colours,
                       cumulative=cumulative,
                       pie_legend=pie_legend,
                       partial_pie=partial_pie,
                       show_totals=show_totals,
                       transparent=transparent,
                       output_format=output_format,
                       interactive=interactive,
                       black_and_white=black_and_white,
                       show_p_val=show_p_val,
                       indices=indices,
                       transpose=transpose,
                       **kwargs
                      )

[docs]    def multiplot(self, leftdict={}, rightdict={}, **kwargs):
        from corpkit.plotter import multiplotter
        return multiplotter(self, leftdict=leftdict, rightdict=rightdict, **kwargs)

[docs]    def language_model(self, name, *args, **kwargs):
        """
        Make a language model from an Interrogation. This is usually done 
        directly on a :class:`corpkit.corpus.Corpus` object with the 
        :func:`~corpkit.corpus.Corpus.make_language_model` method.
        """
        from corpkit.model import _make_model_from_interro
        multi = self.multiindex()
        order = len(self.query['show'])
        return _make_model_from_interro(multi, name, order=order, *args, **kwargs)

[docs]    def save(self, savename, savedir='saved_interrogations', **kwargs):
        """
        Save an interrogation as pickle to ``savedir``.

        :Example:
        
        >>> o = corpus.interrogate(W, 'any')
        ### create ./saved_interrogations/savename.p
        >>> o.save('savename')
        
        :param savename: A name for the saved file
        :type savename: `str`
        
        :param savedir: Relative path to directory in which to save file
        :type savedir: `str`
        
        :param print_info: Show/hide stdout
        :type print_info: `bool`
        
        :returns: None
        """
        from corpkit.other import save
        save(self, savename, savedir=savedir, **kwargs)

[docs]    def quickview(self, n=25):
        """view top n results as painlessly as possible.

        :Example:
        
        >>> data.quickview(n=5)
            0: to    (n=2227)
            1: that  (n=2026)
            2: the   (n=1302)
            3: then  (n=857)
            4: think (n=676)

        :param n: Show top *n* results
        :type n: `int`
        :returns: `None`
        """
        from corpkit.other import quickview
        quickview(self, n=n)

[docs]    def tabview(self, **kwargs):
        import tabview
        tabview.view(self.results, **kwargs)

[docs]    def asciiplot(self,
                  row_or_col_name,
                  axis=0,
                  colours=True,
                  num_to_plot=100,
                  line_length=120,
                  min_graph_length=50,
                  separator_length=4,
                  multivalue=False,
                  human_readable='si',
                  graphsymbol='*',
                  float_format='{:,.2f}',
                  **kwargs):
        """
        A very quick ascii chart for result
        """
        from ascii_graph import Pyasciigraph
        from ascii_graph.colors import Gre, Yel, Red, Blu
        from ascii_graph.colordata import vcolor
        from ascii_graph.colordata import hcolor
        import pydoc

        graph = Pyasciigraph(
                            line_length=line_length,
                            min_graph_length=min_graph_length,
                            separator_length=separator_length,
                            multivalue=multivalue,
                            human_readable=human_readable,
                            graphsymbol=graphsymbol
                            )
        if axis == 0:
            dat = self.results.T[row_or_col_name]
        else:
            dat = self.results[row_or_col_name]
        data = list(zip(dat.index, dat.values))[:num_to_plot]
        if colours:
            pattern = [Gre, Yel, Red]
            data = vcolor(data, pattern)

        out = []
        for line in graph.graph(label=None, data=data, float_format=float_format):
            out.append(line)
        pydoc.pipepager('\n'.join(out), cmd='less -X -R -S')

[docs]    def rel(self, denominator='self', **kwargs):
        return self.edit('%', denominator, **kwargs)

[docs]    def keyness(self, measure='ll', denominator='self', **kwargs):
        return self.edit('k', denominator, **kwargs)

[docs]    def multiindex(self, indexnames=None):
        """Create a `pandas.MultiIndex` object from slash-separated results.

        :Example:

        >>> data = corpus.interrogate({W: 'st$'}, show=[L, F])
        >>> data.results
            ..  just/advmod  almost/advmod  last/amod 
            01           79             12          6 
            02          105              6          7 
            03           86             10          1 
        >>> data.multiindex().results
            Lemma       just almost last first   most 
            Function  advmod advmod amod  amod advmod 
            0             79     12    6     2      3 
            1            105      6    7     1      3 
            2             86     10    1     3      0                                   

        :param indexnames: provide custom names for the new index, or leave blank to guess.
        :type indexnames: `list` of strings

        :returns: :class:`corpkit.interrogation.Interrogation`, with 
                  `pandas.MultiIndex` as 
        :py:attr:`~corpkit.interrogation.Interrogation.results` attribute
        """

        from corpkit.other import make_multi
        return make_multi(self, indexnames=indexnames)

[docs]    def topwords(self, datatype='n', n=10, df=False, sort=True, precision=2):
        """Show top n results in each corpus alongside absolute or relative frequencies.

        :param datatype: show abs/rel frequencies, or keyness
        :type datatype: `str` (n/k/%)
        :param n: number of result to show
        :type n: `int`
        :param df: return a DataFrame
        :type df: `bool`
        :param sort: Sort results, or show as is
        :type sort: `bool`
        :param precision: float precision to show
        :type precision: `int`

        :Example:

        >>> data.topwords(n=5)
            1987           %   1988           %   1989           %   1990           %
            health     25.70   health     15.25   health     19.64   credit      9.22
            security    6.48   cancer     10.85   security    7.91   health      8.31
            cancer      6.19   heart       6.31   cancer      6.55   downside    5.46
            flight      4.45   breast      4.29   credit      4.08   inflation   3.37
            safety      3.49   security    3.94   safety      3.26   cancer      3.12

        :returns: None
        """
        from corpkit.other import topwords
        if df:
            return topwords(self, datatype=datatype, n=n, df=True,
                            sort=sort, precision=precision)
        else:
            topwords(self, datatype=datatype, n=n,
                     sort=sort, precision=precision)



[docs]    def perplexity(self):
        """
        Pythonification of the formal definition of perplexity.

        input:  a sequence of chances (any iterable will do)
        output: perplexity value.

        from https://github.com/zeffii/NLP_class_notes
        """

        def _perplex(chances):
            import math
            chances = [i for i in chances if i] 
            N = len(chances)
            product = 1
            for chance in chances:
                product *= chance
            return math.pow(product, -1/N)

        return self.results.apply(_perplex, axis=1)

[docs]    def entropy(self):
        """
        entropy(pos.edit(merge_entries=mergetags, sort_by='total').results.T
        """
        from scipy.stats import entropy
        import pandas as pd
        escores = entropy(self.rel().results.T)
        ser = pd.Series(escores, index=self.results.index)
        ser.name = 'Entropy'
        return ser

[docs]    def shannon(self):
        from corpkit.stats import shannon
        return shannon(self)

class Concordance(pd.core.frame.DataFrame):
    """
    A class for concordance lines, with methods for saving, formatting and editing.
    """
    
    def __init__(self, data):

        super(Concordance, self).__init__(data)
        self.concordance = data

[docs]    def format(self, kind='string', n=100, window=35,
               print_it=True, columns='all', metadata=True, **kwargs):
        """
        Print concordance lines nicely, to string, LaTeX or CSV

        :param kind: output format: `string`/`latex`/`csv`
        :type kind: `str`
        :param n: Print first `n` lines only
        :type n: `int`/`'all'`
        :param window: how many characters to show to left and right
        :type window: `int`
        :param columns: which columns to show
        :type columns: `list`

        :Example:

        >>> lines = corpus.concordance({T: r'/NN.?/ >># NP'}, show=L)
        ### show 25 characters either side, 4 lines, just text columns
        >>> lines.format(window=25, n=4, columns=[L,M,R])
            0                  we 're in  tucson     , then up north to flagst
            1  e 're in tucson , then up  north      to flagstaff , then we we
            2  tucson , then up north to  flagstaff  , then we went through th
            3   through the grand canyon  area       and then phoenix and i sp

        :returns: None
        """
        from corpkit.other import concprinter
        if print_it:
            print(concprinter(self, kind=kind, n=n, window=window,
                           columns=columns, return_it=True, metadata=metadata, **kwargs))
        else:
            return concprinter(self, kind=kind, n=n, window=window,
                           columns=columns, return_it=True, metadata=metadata, **kwargs)

[docs]    def calculate(self):
        """Make new Interrogation object from (modified) concordance lines"""
        from corpkit.process import interrogation_from_conclines
        return interrogation_from_conclines(self)

[docs]    def shuffle(self, inplace=False):
        """Shuffle concordance lines

        :param inplace: Modify current object, or create a new one
        :type inplace: `bool`

        :Example:

        >>> lines[:4].shuffle()
            3  01  1-01.txt.conll   through the grand canyon  area       and then phoenix and i sp
            1  01  1-01.txt.conll  e 're in tucson , then up  north      to flagstaff , then we we
            0  01  1-01.txt.conll                  we 're in  tucson     , then up north to flagst
            2  01  1-01.txt.conll  tucson , then up north to  flagstaff  , then we went through th

        """
        import random
        index = list(self.index)
        random.shuffle(index)
        shuffled = self.ix[index]
        shuffled.reset_index()
        if inplace:
            self = shuffled
        else:
            return shuffled

[docs]    def edit(self, *args, **kwargs):
        """
        Delete or keep rows by subcorpus or by middle column text.

        >>> skipped = conc.edit(skip_entries=r'to_?match')
        """

        from corpkit.editor import editor
        return editor(self, *args, **kwargs)

    def __str__(self):
        return self.format(print_it=False)

    def __repr__(self):
        return self.format(print_it=False)

[docs]    def less(self, **kwargs):
        import pydoc
        pydoc.pipepager(self.format(print_it=False, **kwargs), cmd='less -X -R -S')

class Interrodict(OrderedDict):
    """
    A class for interrogations that do not fit in a single-indexed DataFrame.

    Individual interrogations can be looked up via dict keys, indexes or attributes:

    :Example:

    >>> out_data['WSJ'].results
    >>> out_data.WSJ.results
    >>> out_data[3].results

    Methods for saving, editing, etc. are similar to 
    :class:`corpkit.corpus.Interrogation`. Additional methods are available for 
    collapsing into single (multi-indexed) DataFrames.

    This class is now deprecated, in favour of a multiindexed DataFrame.
    """
    
    def __init__(self, data):
        """
        Fill the dict from *data* and expose every value as an attribute.

        :param data: a mapping, or a `list` (converted with `OrderedDict`)
        """
        from corpkit.process import makesafe
        if isinstance(data, list):
            data = OrderedDict(data)
        # attribute access: each value is also reachable under the
        # ``makesafe`` version of its key
        for name, value in data.items():
            setattr(self, makesafe(name), value)
        self.query = None
        super(Interrodict, self).__init__(data)

    def __getitem__(self, key):
        """allow slicing, indexing"""
        from corpkit.process import makesafe
        # allow slicing
        if isinstance(key, slice):
            n = OrderedDict()
            for ii in range(*key.indices(len(self))):
                n[self.keys()[ii]] = self[ii]
            return Interrodict(n)
        # allow integer index
        elif isinstance(key, int):
            return next(v for i, (k, v) in enumerate(self.items()) if i == key)
            #return self.subcorpora.__getitem__(makesafe(self.subcorpora[key]))
        # dict key access
        else:
            try:
                return OrderedDict.__getitem__(self, key)
            except:
                from corpkit.process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    def __setitem__(self, key, value):
        """
        Store *value* under *key*, and as an attribute named
        ``makesafe(key)``.
        """
        from corpkit.process import makesafe
        attribute_name = makesafe(key)
        setattr(self, attribute_name, value)
        super(Interrodict, self).__setitem__(key, value)
        
    def __repr__(self):
        """Return ``<ClassName instance: N items>``, N being ``len(self)``."""
        label = classname(self)
        size = len(self)
        return "<%s instance: %d items>" % (label, size)

    def __str__(self):
        """Return ``<ClassName instance: N items>``, N being ``len(self)``."""
        label = classname(self)
        size = len(self)
        return "<%s instance: %d items>" % (label, size)

[docs]    def edit(self, *args, **kwargs):
        """Edit each value with :func:`~corpkit.interrogation.Interrogation.edit`.

        See :func:`~corpkit.interrogation.Interrogation.edit` for possible arguments.

        :returns: A :class:`corpkit.interrogation.Interrodict`
        """

        from corpkit.editor import editor
        return editor(self, *args, **kwargs)

[docs]    def multiindex(self, indexnames=False):

        """Create a `pandas.MultiIndex` version of results.

        :Example:

        >>> d = corpora.interrogate({F: 'compound', GL: '^risk'}, show=L)
        >>> d.keys()
            ['CHT', 'WAP', 'WSJ']
        >>> d['CHT'].results
            ....  health  cancer  security  credit  flight  safety  heart
            1987      87      25        28      13       7       6      4
            1988      72      24        20      15       7       4      9
            1989     137      61        23      10       5       5      6
        >>> d.multiindex().results
            ...               health  cancer  credit  security  downside  
            Corpus Subcorpus                                             
            CHT    1987           87      25      13        28        20 
                   1988           72      24      15        20        12 
                   1989          137      61      10        23        10                                                      
            WAP    1987           83      44       8        44        10 
                   1988           83      27      13        40         6 
                   1989           95      77      18        25        12 
            WSJ    1987           52      27      33         4        21 
                   1988           39      11      37         9        22 
                   1989           55      47      43         9        24 

        :returns: A :class:`corpkit.interrogation.Interrogation`
        """

        import pandas as pd
        import numpy as np
        from itertools import product
        from corpkit.interrogation import Interrodict, Interrogation

        query = self.query

        def trav(dct, parents={}, level=0, colset=set(), results=list(), conc_results=list(), myparname=[]):
            """
            Traverse the Interrodict and flatten it out
            """
            from collections import defaultdict
            columns = False
            if hasattr(dct, 'items'):
                parents[level] = list(dct.keys())
                level += 1
                for k, v in list(dct.items()):
                    pars = myparname + [k]
                    # the below is only for python3
                    #pars = [*myparname, k]
                    trav(v, parents=parents, level=level,
                         results=results, myparname=pars)
            else:
                if parents.get(level):
                    parents[level] |= set(dct.results.index)
                else:
                    parents[level] = set(dct.results.index)

                if not dct.results.empty:
                    if dct.concordance is not None:
                      conc_results.append(dct.concordance)
                    for n, ser in dct.results.iterrows():
                        ser.name = tuple(myparname + [ser.name])
                        #ser.name = (*myparname, ser.name)
                        results.append(ser)
                    for c in list(dct.results.columns):
                        colset.add(c)
                    level += 1
            return results, conc_results

        data, conc = trav(self)
        index = [i.name for i in data]
        if conc:
            conc = pd.concat(conc)
        else:
            conc = None

        # todo: better default for speakers?
        if isinstance(self.query, dict) and self.query.get('subcorpora'):
            nms = {'names': self.query['subcorpora']}
        elif indexnames:
            nms = {'names': indexnames}
        else:
            nms = {} 
        ix = pd.MultiIndex.from_tuples(index, **nms)
        df = pd.DataFrame(data, index=ix)
        df = df.fillna(0).astype(int)
        df = df[df.sum().sort_values(ascending=False).index]
        totals = df.sum(axis=1)
        return Interrogation(results=df, totals=totals, query=query, concordance=conc)

[docs]    def save(self, savename, savedir='saved_interrogations', **kwargs):
        """
        Save an interrogation as pickle to `savedir`.

        :param savename: A name for the saved file
        :type savename: `str`
        
        :param savedir: Relative path to directory in which to save file
        :type savedir: `str`
        
        :param print_info: Show/hide stdout
        :type print_info: `bool`
        
        :Example: 

        >>> o = corpus.interrogate(W, 'any')
        ### create ``saved_interrogations/savename.p``
        >>> o.save('savename')

        :returns: None
        """
        from corpkit.other import save
        save(self, savename, savedir=savedir, **kwargs)

[docs]    def collapse(self, axis='y'):
        """
        Collapse Interrodict on an axis or along interrogation name.

        :param axis: collapse along x, y or name axis
        :type axis: `str`: x/y/n

        :Example:

        .. code-block:: python

           >>> d = corpora.interrogate({F: 'compound', GL: r'^risk'}, show=L)
           
           >>> d.keys()
               ['CHT', 'WAP', 'WSJ']
           
           >>> d['CHT'].results
               ....  health  cancer  security  credit  flight  safety  heart
               1987      87      25        28      13       7       6      4
               1988      72      24        20      15       7       4      9
               1989     137      61        23      10       5       5      6
           
           >>> d.collapse().results
               ...  health  cancer  credit  security
               CHT    3174    1156     566       697
               WAP    2799     933     582      1127
               WSJ    1812     680    2009       537
           
           >>> d.collapse(axis='x').results
               ...  1987  1988  1989
               CHT   384   328   464
               WAP   389   355   435
               WSJ   428   410   473
           
           >>> d.collapse(axis='key').results
               ...   health  cancer  credit  security
               1987     282     127      65        93
               1988     277     100      70       107
               1989     379     253      83        91
               
        :returns: A :class:`corpkit.interrogation.Interrogation`
        """
        # join on keys ... probably shouldn't transpose like this though!
        if axis.lower()[0] not in ['x', 'y']:
            df = self.values()[0].results
            others = [i.results.T for i in list(self.values())[1:]]
            try:
                df = df.T.join(others).T
            except ValueError:
                for i in self.values()[1:]:
                    df = df.add(i.results, fill_value=0)

            df = df.fillna(0)
        else:
            out = []
            for corpus_name, interro in self.items():
                if axis.lower().startswith('y'):
                    ax = 0
                elif axis.lower().startswith('x'):
                    ax = 1
                data = interro.results.sum(axis=ax)
                data.name = corpus_name
                out.append(data)
            # concatenate and transpose
            df = pd.concat(out, axis=1).T
            # turn NaN to 0, sort
            df = df.fillna(0)
        
        #make interrogation object from df
        if not axis.lower().startswith('x'):
            df = df.edit(sort_by='total', print_info=False)
        else:
            df = df.edit(print_info=False)
        # make sure everything is int, not float
        for col in list(df.results.columns):
            df.results[col] = df.results[col].astype(int)
        return df

[docs]    def topwords(self, datatype='n', n=10, df=False, sort=True, precision=2):
        """Show top n results in each corpus alongside absolute or relative frequencies.

        :param datatype: show abs/rel frequencies, or keyness
        :type datatype: `str` (n/k/%)
        :param n: number of result to show
        :type n: `int`
        :param df: return a DataFrame
        :type df: `bool`
        :param sort: Sort results, or show as is
        :type sort: `bool`
        :param precision: float precision to show
        :type precision: `int`
        :Example:

        >>> data.topwords(n=5)
            TBT            %   UST            %   WAP            %   WSJ            %
            health     25.70   health     15.25   health     19.64   credit      9.22
            security    6.48   cancer     10.85   security    7.91   health      8.31
            cancer      6.19   heart       6.31   cancer      6.55   downside    5.46
            flight      4.45   breast      4.29   credit      4.08   inflation   3.37
            safety      3.49   security    3.94   safety      3.26   cancer      3.12

        :returns: None
        """
        from corpkit.other import topwords
        if df:
            return topwords(self, datatype=datatype, n=n, df=True,
                            sort=sort, precision=precision)
        else:
            topwords(self, datatype=datatype, n=n,
                     sort=sort, precision=precision)

[docs]    def visualise(self, shape='auto', truncate=8, **kwargs):
        """
        Attempt to visualise Interrodict by using subplots

        :param shape: Layout for the subplots (e.g. `(2, 2)`)
        :type shape: tuple

        :param truncate: Only process the first `n` items in the 
                         class:`corpkit.interrogation.Interrodict`
        :type truncate: `int`

        :param kwargs: specifications to pass to :func:`~corpkit.plotter.plotter`
        :type kwargs: keyword arguments
        """
        import matplotlib.pyplot as plt
        if shape == 'auto':
            shape = (int(len(self) / 2), 2)
        if truncate:
            self = self[:truncate]
        f, axes = plt.subplots(*shape)
        for (name, interro), ax in zip(self.items(), axes.flatten()):
            if kwargs.get('name_format'):
                name = name_format.format(name)
            interro.visualise(name, ax=ax, **kwargs)
        return plt

[docs]    def copy(self):
        from corpkit.interrogation import Interrodict
        copied = {}
        for k, v in self.items():
            copied[k] = v
        return Interrodict(copied)

[docs]    def flip(self, truncate=30, transpose=True, repeat=False, *args, **kwargs):
        """
        Change the dimensions of :class:`corpkit.interrogation.Interrodict`,
        making column names into keys.

        :param truncate: Get first `n` columns
        :type truncate: `int`/`'all'`

        :param transpose: Flip rows and columns:
        :type transpose: `bool`

        :param repeat: Flip twice, to move columns into key position
        :type repeat: `bool`

        :param kwargs: Arguments to pass to the 
                       :func:`~corpkit.interrogation.Interrogation.edit`
                       method

        :returns: :class:`corpkit.interrogation.Interrodict`
        """
        import pandas as pd
        from corpkit.interrogation import Interrodict

        # copy interrodict
        copied = self.copy()

        # first, flip x axis and keys
        words = list(copied.collapse().results.columns)
        if truncate != 'all':
            words = words[:truncate]

        data = {}
        for word in words:
            wordata = []
            for k, v in copied.items():
                try:
                    point = v.results[word]
                except KeyError:
                    ser = [0] * len(v.results.index)
                    point = pd.Series(ser, index=v.results.index)
                point.name = k
                wordata.append(point)
            df = pd.concat(wordata, axis=1)
            if transpose:
                df = df.T
            df = df.edit(*args, **kwargs)
            # divide each newspaper separately
            data[word] = df
        idi = Interrodict(data)
        if repeat:
            return idi.flip(truncate=truncate, transpose=False, repeat=False)
        else:
            return idi

[docs]    def get_totals(self):
        """
        Helper function to concatenate all totals
        """
        lst = []
        # for each interrogation name and data
        for k, v in self.items():
            # get the totals
            tot = v.totals
            # name the totals with the corpus name
            tot.name = k
            # add to a list
            lst.append(tot)
        # turn the list into a dataframe
        return pd.concat(lst, axis=1)