Source code for core

# -*- coding: utf-8 -*-

import sys, os
sys.path.append( os.path.dirname(os.path.realpath(__file__) ) )

import os
import random
import types

from matplotlib import pyplot as plt

from loaders import common as datacommon
import importlib

## Directory that contains this module; data_sources() looks for the loaders under it.
MY_DIR = os.path.dirname(os.path.realpath(__file__))
MY_DIR = MY_DIR.replace('core.', '') ## XXX: hack, should be fixed
DATA_DIR = './data/' ## by default the data comes here

## toggle to check if running in iPYTHON notebook
IPYTHON_NOTEBOOK = False
try:
    ## Evaluating the bare name is the whole test: when `get_ipython` is not
    ## defined the lookup raises NameError and the toggle stays False.
    get_ipython
    IPYTHON_NOTEBOOK = True
except NameError:
    ## Only the missing-name case is expected here; anything else
    ## (e.g. KeyboardInterrupt) must not be silently swallowed.
    pass

if IPYTHON_NOTEBOOK:
    ## Load d3 into the notebook page so that the HTML based visualisations can use it.
    from IPython.core.display import HTML, display
    display( HTML('<p><script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.6/d3.js"></script>Visualisations enabled.</p>') )

def set_data_path( path ):
    """ Sets the path where the data is stored. Relative to where you run your Python.

        After storing the path, every directory found directly under it is
        passed to ``datacommon._version`` so that the data versions are reported.

        :param path: Where the data is stored.
        :type path: str
        :returns: None
        :raises OSError: If `path` can not be listed (raised by ``os.listdir``).

        :Example:

        * ``core.set_data_path('.') ## search for data from the current folder.``
        * ``core.set_data_path('~/Documents/data/hybra-data') ## data in folder Documents/data/hybra-data.``
    """
    global DATA_DIR
    DATA_DIR = path

    ## when data path is set, automatically print out the versions
    for folder in os.listdir( path ):
        ## join the parts instead of concatenating them: with plain `path + folder`
        ## a path given without a trailing separator (such as '.') never matched
        ## its own subfolders.
        if os.path.isdir( os.path.join( path, folder ) ):
            datacommon._version( folder )

    return None

def data_path():
    """ Gives back the data path that is currently in use.

        :returns: The value of the module level ``DATA_DIR``.
    """
    ## reading a module level name needs no `global` declaration
    return DATA_DIR

def data_sources():
    """ Lists possible data sources hybra core can parse.

        A data source is every ``.py`` file in the ``loaders`` folder under
        ``MY_DIR`` whose file name does not start with an underscore.

        :returns: Names of the loader modules, without the ``.py`` extension.
        :rtype: list
    """
    loaders_dir = os.path.join( MY_DIR, 'loaders' )
    extension = '.py'

    ## Build a real list (not a lazy map object) so the result can be printed and
    ## iterated more than once, and cut off only the trailing extension so that a
    ## '.py' in the middle of a file name is left alone.
    return [
        name[ :-len( extension ) ]
        for name in os.listdir( loaders_dir )
        if name.endswith( extension ) and not name.startswith('_')
    ]

def data( source, folder = '', **kwargs ):
    """ Load data of type `source` with the loader module written for that type.

        :param source: Type of data loaded. Has to be one of the names returned by ``data_sources()``.
        :type source: str
        :param folder: Folder under data path that contains the data to be loaded.
        :type folder: str
        :returns: Whatever the ``load`` function of the chosen loader returns.
        :raises NameError: If `source` is not a known data source.

        :Kwargs:
            * *terms* (*list*) --
              If source is `facebook`, `news` or `twitter`. Terms to be searched for in data filenames. Given as strings.
            * *data_dir* (*str*) --
              Data directory to override set data path.

            All keyword arguments are handed over to the loader together with `folder`.

        :Example:

        ``core.data('news', terms = ['uutiset'], folder = 'yle') ## load news data from files with filename containing the term 'uutiset' from the subfolder YLE in your data folder.``
    """
    known_sources = data_sources()
    if source not in known_sources:
        raise NameError('Unknown media type')

    loader = importlib.import_module( 'loaders.' + source )

    ## the loader gets the caller's options plus the folder; the configured
    ## data path is filled in only when the caller gave no `data_dir`.
    options = dict( kwargs, folder = folder )
    options.setdefault( 'data_dir', DATA_DIR )

    return loader.load( **options )

def describe( data ):
    """ Show a description of the dataset `data`: the amount of posts,
        number of authors, historical data and more detailed data sources.

        The description is produced by ``descriptives.describe`` and shown as HTML
        through IPython's ``display``.

        :param data: Data entries. Given as generator or list.
        :type data: generator or list
        :returns: The return value of IPython's ``display``.
    """

    import descriptives
    from IPython.core.display import display, HTML

    description = descriptives.describe( data )
    rendered = HTML( description )

    return display( rendered )

def timeline( datasets = None, **kwargs ):
    """ Draws a timeline of the given datasets.

        The timeline is created by ``module_timeline.create_timeline`` and shown as
        HTML through IPython's ``display``.

        :param datasets: Datasets to plot. Given as generators or lists. Defaults to an empty list.
        :type datasets: list
        :returns: The return value of IPython's ``display``.

        :Kwargs:
            * *colors* (*list*) --
              List of css colors given as strings to be used in drawing the timeline plots.

            All keyword arguments are handed over to ``create_timeline`` together with `datasets`.

        :Example:

        ``core.timeline(datasets = [news_data, fb_data], colors = ['blue', 'red']) ## Plots the dataset `news_data` as blue timeline and the dataset `fb_data` as red timeline.``
    """

    from timeline import module_timeline
    from IPython.core.display import display, HTML

    ## A list literal as default value would be one shared object for every call;
    ## create a fresh list per call instead.
    kwargs['datasets'] = [] if datasets is None else datasets

    return display( HTML( module_timeline.create_timeline( **kwargs ) ) )

def network( data ):
    """ Draws a network of the dataset `data`.

        The network is created by ``module_network.create_network`` and shown as
        HTML through IPython's ``display``.

        :param data: Data entries.
        :type data: generator or list
        :returns: The return value of IPython's ``display``.
    """

    from network import module_network
    from IPython.core.display import display, HTML

    network_markup = module_network.create_network( data )
    rendered = HTML( network_markup )

    return display( rendered )

def wordcloud( data, **kwargs ):
    """ Draws a wordcloud of the dataset `data`.

        The drawing is delegated to ``wordclouds.create_wordcloud``, which receives
        the data, the module level pyplot object ``plt`` and the keyword arguments.

        :param data: Data entries.
        :type data: generator or list
        :returns: None

        :Kwargs:
            * *stopwords* (*list*) --
              Words to be ignored in generating the wordcloud. Given as strings.
    """

    import wordclouds

    draw = wordclouds.create_wordcloud
    draw( data, plt, **kwargs )

def analyse( script, **kwargs ):
    """ Run R code given in parameter `script` using rpy2.
        You can provide the R code python variables in kwargs and those are automatically transferred to suitable R format.

        :param script: R code or a path to script to be run.
        :type script: str
        :returns: Whatever ``analysis.run.run`` returns.

        :Kwargs:
            Parameters and their values with which to parameterize the R script.

            * *previous* --
              Not handed to the script as a variable. It is passed to ``analysis.run.run``
              as the environment argument instead (presumably the result of an earlier
              ``analyse`` call -- confirm against ``analysis.run``). Defaults to None.

        :Example:

        ``core.analyse( \"\"\"t <- table( df$a, df$b)
        print( chisq.test( t ) )
        \"\"\", df = data)
        ## Runs the χ²-test to examine the expected cross-tabulated frequencies of a and b to observed frequencies in data. data is a list of dictionaries, each dictionary having a and b variables.``
    """

    from analysis.run import run

    ## Take `previous` out of the keyword arguments so it is not forwarded as a
    ## script variable. (The old code looked it up with an undefined name and
    ## raised NameError whenever `previous` was given.)
    globalenv = kwargs.pop( 'previous', None )

    return run( script, globalenv, **kwargs )

def export( data, file_path ):
    """ Export the dataset `data` in common format to the given file format.
        Recognizes output format from file extension in given file path.
        Accepted formats: .csv, .xlsx

        The text after the last dot of `file_path` selects the function
        ``export_<extension>`` from ``helpers.exporter``. If that lookup or the
        export itself fails, the error and the supported file types are printed
        and no exception is raised.

        :param data: Data entries to be exported.
        :type data: generator or list
        :param file_path: Path to output file.
        :type file_path: str
        :returns: None

        :Example:

        ``core.export(data, 'exported_data.csv') ## Exports data in common format to file 'exported_data.csv' in current path.``
    """

    from helpers import exporter

    file_type = file_path.split('.')[-1]

    try:
        file_exporter = getattr( exporter, 'export_' + file_type )

        file_exporter( data, file_path )

    ## `as` is the spelling understood by both Python 2.6+ and Python 3
    except Exception as e:
        print(repr(e))
        print("File export failed. Supported file types:")

        ## every `export_*` name in the exporter module stands for one file type
        for f in filter( lambda x: x.startswith('export_') , dir( exporter ) ):
            print( '.' + f.replace('export_', '') )

def sample(data, size, seed = 100, export_file = None):
    """ Takes a random sample of the dataset `data`.
        Exports the sample to file using the core module export method
        if the parameter `export_file` is not None.

        Note that the module level random number generator is re-seeded with
        `seed` on every call.

        :param data: Data entries to be sampled.
        :type data: generator or list
        :param size: An integer value specifying the sample size.
        :type size: int
        :param seed: Seed to use in randomization. Defaults to 100.
        :type seed: int
        :param export_file: Path to output file. Defaults to None.
        :type export_file: None or str
        :returns: The sampled entries.
        :rtype: list
        :raises ValueError: If `size` is larger than the number of entries (raised by ``random.sample``).

        :Example:

        ``core.sample(data, 100, seed = 0, export_file = 'exported_sample.csv') ## Takes a random sample of dataset `data` using the seed 0 and exports it to file 'exported_sample.csv' in current path.``
    """

    ## random.sample needs a sequence. Besides generators this also covers other
    ## iterables such as dictionary views, which used to be passed on unchanged.
    if not isinstance( data, (list, tuple) ):
        data = list( data )

    random.seed(seed)

    data_sample = random.sample(data, size)

    if export_file:
        export( data_sample, export_file )

    return data_sample

def unduplicate( data ):
    """ Removes all duplicates from `data` and returns only unique items.

        Entries are considered duplicates when they have the same value under
        the key ``'id'``. Of several entries with the same id the last one wins.

        :param data: Entries of data with potential duplicates.
        :type data: generator or list
        :returns: One entry per distinct id.
        :rtype: list
        :raises KeyError: If an entry has no ``'id'`` key.
    """

    ## later entries overwrite earlier ones that share the same id
    unique_by_id = {}

    for entry in data:
        unique_by_id[ entry['id'] ] = entry

    ## hand out a real list; on Python 3 `values()` alone is only a view that
    ## can not be indexed or sampled
    return list( unique_by_id.values() )

def filter_by( data, filter_type, **kwargs ):
    """ Filters the dataset `data` with the filter given in `filter_type`.
        Returns the filtered data if `filter_type` matches a filtering method
        in the module filters.

        The filter is the function ``filter_by_<filter_type>`` from ``helpers.filters``.
        If that lookup or the filtering itself fails, the error and the supported
        filters are printed and None is returned.

        :param data: Data entries to be filtered.
        :type data: generator or list
        :param filter_type: Filter type to be used. Can be `text`, `datetime`, `author` or `domain`.
        :type filter_type: str
        :returns: The result of the filter, or None if filtering failed.

        :Kwargs:
            * *text* (*list*) --
              If filter_type is `text`. List of strings to use for filtering.
            * *substrings* (*bool*) --
              If filter_type is `text`. If True, will search substrings in text content for terms given in parameter `text`. Defaults to True.
            * *inclusive* (*bool*) --
              If filter_type is `text`. If True, returns only entries with all terms given in parameter `text`. Defaults to True.
            * *after* (*str*) --
              Date and time after which to return entries.
            * *before* (*str*) --
              Date and time before which to return entries.
            * *authors* (*list*) --
              If filter_type is `author`. List of authors as strings to filter by.
            * *domains* (*list*) --
              If filter_type is `domain`. List of domains as strings to filter by.

        :Example:

        * ``core.filter_by(data, 'text', text = ['research']) ## Return from dataset `data` entries which include the term 'research' in text content.``
        * ``core.filter_by(data, 'text', text = ['research', 'science'], substrings = False, inclusive = False) ## Return from dataset `data` entries which include the term 'research' or the term 'science' in text content as full strings.``
        * ``core.filter_by(data, 'datetime', after = '2015-2-15') ## Return from dataset `data` entries with timestamp after the date '2015-2-15'.``
        * ``core.filter_by(data, 'datetime', after = '2017-1-1', before = '2017-6-30 18:00:00') ## Return from dataset `data` entries with timestamp after the date '2017-1-1' and before the time '2017-6-30 18:00:00'.``
        * ``core.filter_by(data, 'author', authors = ['author1', 'author2']) ## Return from dataset `data` entries which have 'author1' or 'author2' as creator.``
        * ``core.filter_by(data, 'domain', domains = ['domain1.com', 'domain2.net']) ## Return from dataset `data` entries which are from domains 'domain1.com' or 'domain2.net'.``
    """

    from helpers import filters

    try:
        filter_helper = getattr( filters, 'filter_by_' + filter_type )

        return filter_helper( data, **kwargs )

    ## `as` is the spelling understood by both Python 2.6+ and Python 3
    except Exception as e:
        print(repr(e))
        print('Data filtering failed. Supported filters:')

        ## every `filter_by_*` name in the filters module stands for one filter type
        for f in filter(lambda x: x.startswith('filter_by_'), dir(filters) ):
            print( f.replace('filter_by_', '') )

def counts( data, count_by, verbose = False ):
    """ Counts the occurrences of the feature `count_by` in the dataset `data`.
        Returns the counts as a Counter object and prints them if `verbose` is True.

        The counting is done by the function ``counts_<count_by>`` from
        ``helpers.counters``. If that lookup or the counting itself fails, the error
        and the supported features are printed and None is returned.

        :param data: Data entries to be counted.
        :type data: generator or list
        :param count_by: The feature to be used for counting. Can be `author` or `domain`.
        :type count_by: str
        :param verbose: If True, prints the counts. Defaults to False.
        :type verbose: bool
        :returns: The result of the counting helper, or None if counting failed.

        :Example:

        * ``core.counts(data, count_by = 'author') ## counts distinct authors in data.``
        * ``core.counts(data, count_by = 'domain', verbose = True) ## counts distinct domains in data and print the counts.``
    """

    from helpers import counters

    try:
        counts_helper = getattr( counters, 'counts_' + count_by )

        return counts_helper( data, verbose )

    ## `as` is the spelling understood by both Python 2.6+ and Python 3
    except Exception as e:
        print(repr(e))
        print("Getting counts failed. Supported features to count by:")

        ## every `counts_*` name in the counters module stands for one countable feature
        for c in filter( lambda x: x.startswith('counts_'), dir( counters ) ):
            print( c.replace('counts_', '') )