Source code for core

# -*- coding: utf-8 -*-

import sys, os
sys.path.append( os.path.dirname(os.path.realpath(__file__) ) )

import os
import random
import types

from matplotlib import pyplot as plt

from loaders import common as datacommon
import importlib

# Directory that holds this module; data_sources() looks for loaders beneath it.
MY_DIR = os.path.dirname(os.path.realpath(__file__))
MY_DIR = MY_DIR.replace('core.', '') ## XXX: hack, should be fixed
DATA_DIR = './data/' ## by default the data comes here

## toggle to check if running in iPYTHON notebook
try:
    # Merely referencing the name is enough: it only resolves when IPython
    # has made `get_ipython` available, otherwise a NameError is raised.
    get_ipython
    IPYTHON_NOTEBOOK = True
except NameError:
    # Catch only the "name is missing" case instead of every exception.
    IPYTHON_NOTEBOOK = False

if IPYTHON_NOTEBOOK:
    from IPython.core.display import HTML, display
    # Load d3 into the notebook front-end once, at import time.
    display( HTML('<p><script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.6/d3.js"></script>Visualisations enabled.</p>') )

def set_data_path( path ):
    """
    Sets the path where the data is stored. Relative to where you run your Python.

    After the path is stored, every sub-folder found directly under it is
    handed to ``datacommon._version`` so that the versions get printed out.

    :param path: Where the data is stored.
    :type path: str

    :Example:

    * ``core.set_data_path('.') ## search for data from the current folder.``
    * ``core.set_data_path('~/Documents/data/hybra-data') ## data in folder Documents/data/hybra-data.``
    """

    global DATA_DIR
    DATA_DIR = path

    ## when data path is set, automatically print out the versions
    for folder in os.listdir( path ):
        # Join the components instead of concatenating them, so a path given
        # without a trailing separator (e.g. '.') still points at the sub-folder.
        if os.path.isdir( os.path.join( path, folder ) ):
            datacommon._version( folder )

    return None
def data_path():
    """
    Returns the data path currently in use, i.e. the module-level ``DATA_DIR``.
    """
    # Reading a module global needs no `global` declaration.
    return DATA_DIR
def data_sources():
    """
    Lists possible data sources hybra core can parse.

    A data source is any ``.py`` file in the ``loaders`` folder next to this
    module whose name does not start with an underscore. The names are
    returned without the ``.py`` extension.

    :returns: Names of the available loaders.
    :rtype: list
    """
    extension = '.py'
    loaders_dir = os.path.join( MY_DIR, 'loaders' )

    # Build a real list (not a lazy map object) and strip only the trailing
    # extension, so a '.py' elsewhere in the file name is left alone.
    return [
        name[ :-len( extension ) ]
        for name in os.listdir( loaders_dir )
        if name.endswith( extension ) and not name.startswith('_')
    ]
def data( source, folder = '', **kwargs ):
    """
    Load data of type `source` using the parser for that data.

    The loader module ``loaders.<source>`` is imported on demand and its
    ``load`` function is called with the keyword arguments given here.

    :param source: Type of data loaded. Must be one of the names returned by ``data_sources()``.
    :type source: str
    :param folder: Folder under data path that contains the data to be loaded.
    :type folder: str

    :Kwargs:
        * *terms* (*list*) -- Terms to be searched for in data filenames. Given as strings.
        * *data_dir* (*str*) -- Data directory to override set data path.

    :raises NameError: If `source` is not a known data source.

    :Example:

    ``core.data('news', terms = ['uutiset'], folder = 'yle') ## load news data from files with filename containing the term 'uutiset' from the subfolder YLE in your data folder.``
    """
    if source not in data_sources():
        raise NameError('Unknown media type')

    # An explicit data_dir from the caller wins; otherwise use the module-wide path.
    kwargs.setdefault( 'data_dir', DATA_DIR )
    kwargs['folder'] = folder

    loader_module = importlib.import_module( 'loaders.' + source )
    return loader_module.load( **kwargs )
def describe( data ):
    """
    Describe the dataset `data`, showing the amount of posts, number of authors, historical data and more detailed data sources.

    The description is produced by ``descriptives.describe`` and shown as HTML
    through IPython's ``display``.

    :param data: Data entries. Given as generator or list.
    :type data: generator or list
    """
    import descriptives
    from IPython.core.display import display, HTML

    description_html = descriptives.describe( data )
    return display( HTML( description_html ) )
def timeline( datasets = None, **kwargs ):
    """
    Draws a timeline of the given datasets.

    The HTML is generated by ``module_timeline.create_timeline`` and shown
    through IPython's ``display``.

    :param datasets: Datasets to plot. Given as generators or lists. Defaults to an empty list.
    :type datasets: list

    :Kwargs:
        * *colors* (*list*) -- List of css colors given as strings to be used in drawing the timeline plots.

    :Example:

    ``core.timeline(datasets = [news_data, fb_data], colors = ['blue', 'red']) ## Plots the dataset `news_data` as blue timeline and the dataset `fb_data` as red timeline.``
    """
    from timeline import module_timeline
    from IPython.core.display import display, HTML

    # A fresh list per call: a literal [] default would be one shared object
    # handed to create_timeline on every call that omits `datasets`.
    if datasets is None:
        datasets = []

    kwargs['datasets'] = datasets

    return display( HTML( module_timeline.create_timeline( **kwargs ) ) )
def network( data ):
    """
    Draws a network of the dataset `data`.

    The HTML is generated by ``module_network.create_network`` and shown
    through IPython's ``display``.

    :param data: Data entries.
    :type data: generator or list
    """
    from network import module_network
    from IPython.core.display import display, HTML

    network_html = module_network.create_network( data )
    return display( HTML( network_html ) )
def wordcloud( data, **kwargs ):
    """
    Draws a wordcloud of the dataset `data`.

    Drawing is delegated to ``wordclouds.create_wordcloud``, which receives the
    module-level pyplot object together with any keyword arguments.

    :param data: Data entries.
    :type data: generator or list

    :Kwargs:
        * *stopwords* (*list*) -- Words to be ignored in generating the wordcloud. Given as strings.
    """
    import wordclouds as wordcloud_module

    wordcloud_module.create_wordcloud( data, plt, **kwargs )
def analyse( script, **kwargs ):
    """
    Run R code given in parameter `script` using rpy2.
    You can provide the R code python variables in kwargs and those are automatically transferred to suitable R format.

    :param script: R code or a path to script to be run.
    :type script: str

    :Kwargs:
        * *previous* -- Optional. If given, it is removed from the keyword
          arguments and passed to ``analysis.run.run`` as its second argument
          (``globalenv``); otherwise ``None`` is passed. Presumably the
          environment of an earlier run -- confirm against ``analysis.run``.
        * Any other keyword: parameters and their values with which to parameterize the R script.

    :Example:

    ``core.analyse( \"\"\"t <- table( df$a, df$b) print( chisq.test( t ) ) \"\"\", df = data) ## Runs the χ²-test to examine the expected cross-tabulated frequencies of a and b to observed frequencies in data. data is a list of dictionaries, each dictionary having a and b variables.``
    """
    from analysis.run import run

    # Take 'previous' out of kwargs so it is not also forwarded as an R
    # variable. (The lookup used to go through an undefined name and raised
    # NameError whenever 'previous' was supplied.)
    globalenv = kwargs.pop( 'previous', None )

    return run( script, globalenv, **kwargs )
def export( data, file_path ):
    """
    Export the dataset `data` in common format to the given file format.
    Recognizes output format from file extension in given file path. Accepted formats: .csv, .xlsx

    The text after the last ``.`` in `file_path` selects the function
    ``exporter.export_<extension>``. Any failure (unknown extension or an
    error raised while exporting) is reported by printing the exception and
    the list of supported file types; nothing is raised to the caller.

    :param data: Data entries to be exported.
    :type data: generator or list
    :param file_path: Path to output file.
    :type file_path: str

    :Example:

    ``core.export(data, 'exported_data.csv') ## Exports data in common format to file 'exported_data.csv' in current path.``
    """
    from helpers import exporter

    file_type = file_path.split('.')[-1]

    try:
        file_exporter = getattr( exporter, 'export_' + file_type )
        file_exporter( data, file_path )
    # `as` form: valid on Python 2.6+ and 3, unlike `except Exception, e`.
    except Exception as e:
        print(repr(e))
        print("File export failed. Supported file types:")

        # Every export_* attribute of the exporter module is a supported format.
        for f in dir( exporter ):
            if f.startswith('export_'):
                print( '.' + f.replace('export_', '') )
def sample(data, size, seed = 100, export_file = None):
    """
    Takes a random sample of the dataset `data`.
    Exports the sample to file using the core module export method if the parameter `export_file` is not None.

    The same `seed` always yields the same sample for the same data. The
    module-wide random number generator is left untouched.

    :param data: Data entries to be sampled.
    :type data: generator or list
    :param size: An integer value specifying the sample size.
    :type size: int
    :param seed: Seed to use in randomization. Defaults to 100.
    :type seed: int
    :param export_file: Path to output file. Defaults to None.
    :type export_file: None or str

    :returns: The sampled entries.
    :rtype: list
    :raises ValueError: If `size` is larger than the number of entries (raised by ``random.sample``).

    :Example:

    ``core.sample(data, 100, seed = 0, export_file = 'exported_sample.csv') ## Takes a random sample of dataset `data` using the seed 0 and exports it to file 'exported_sample.csv' in current path.``
    """
    # random.sample needs a sequence: materialise generators, iterators,
    # dict views and the like, not only true generator objects.
    if not isinstance( data, (list, tuple) ):
        data = list( data )

    # A private generator seeded with `seed` draws the same sample as seeding
    # the global one would, without resetting randomness for the caller.
    data_sample = random.Random( seed ).sample( data, size )

    if export_file:
        export( data_sample, export_file )

    return data_sample
def unduplicate( data ):
    """
    Removes all duplicates from `data` and returns only unique items.

    Entries are considered duplicates when they share the same ``'id'`` value;
    of several entries with the same id, the one seen last is kept.

    :param data: Entries of data with potential duplicates.
    :type data: generator or list

    :returns: One entry per distinct id.
    :rtype: list
    """
    # Later entries overwrite earlier ones that carry the same id.
    unique_by_id = {}
    for entry in data:
        unique_by_id[ entry['id'] ] = entry

    # Always hand back a real list; on Python 3 .values() alone is a view
    # that cannot be indexed or passed to random.sample.
    return list( unique_by_id.values() )
def filter_by( data, filter_type, **kwargs ):
    """
    Filters the dataset `data` with the filter given in `filter_type`.
    Returns the filtered data if `filter_type` matches a filtering method in the module filters.

    The function ``filters.filter_by_<filter_type>`` is looked up and called
    with `data` and the keyword arguments. Any failure (unknown filter type or
    an error raised while filtering) is reported by printing the exception and
    the list of supported filters; in that case ``None`` is returned.

    :param data: Data entries to be filtered.
    :type data: generator or list
    :param filter_type: Filter type to be used. Can be `text`, `datetime`, `author` or `domain`.
    :type filter_type: str

    :Kwargs:
        * *text* (*list*) -- If filter_type is `text`. List of strings to use for filtering.
        * *substrings* (*bool*) -- If filter_type is `text`. If True, will search substrings in text content for terms given in parameter `text`. Defaults to True.
        * *inclusive* (*bool*) -- If filter_type is `text`. If True, returns only entries with all terms given in parameter `text`. Defaults to True.
        * *after* (*str*) -- Date and time after which to return entries.
        * *before* (*str*) -- Date and time before which to return entries.
        * *authors* (*list*) -- If filter_type is `author`. List of authors as strings to filter by.
        * *domains* (*list*) -- If filter_type is `domain`. List of domains as strings to filter by.

    :Example:

    * ``core.filter_by(data, 'text', text = ['research']) ## Return from dataset `data` entries which include the term 'research' in text content.``
    * ``core.filter_by(data, 'text', text = ['research', 'science'], substrings = False, inclusive = False) ## Return from dataset `data` entries which include the term 'research' or the term 'science' in text content as full strings.``
    * ``core.filter_by(data, 'datetime', after = '2015-2-15') ## Return from dataset `data` entries with timestamp after the date '2015-2-15'.``
    * ``core.filter_by(data, 'datetime', after = '2017-1-1', before = '2017-6-30 18:00:00') ## Return from dataset `data` entries with timestamp after the date '2017-1-1' and before the time '2017-6-30 18:00:00'.``
    * ``core.filter_by(data, 'author', authors = ['author1', 'author2']) ## Return from dataset `data` entries which have 'author1' or 'author2' as creator.``
    * ``core.filter_by(data, 'domain', domains = ['domain1.com', 'domain2.net']) ## Return from dataset `data` entries which are from domains 'domain1.com' or 'domain2.net'.``
    """
    from helpers import filters

    try:
        filter_helper = getattr( filters, 'filter_by_' + filter_type )
        return filter_helper( data, **kwargs )
    # `as` form: valid on Python 2.6+ and 3, unlike `except Exception, e`.
    except Exception as e:
        print(repr(e))
        print('Data filtering failed. Supported filters:')

        # Every filter_by_* attribute of the filters module is a supported filter.
        for f in dir( filters ):
            if f.startswith('filter_by_'):
                print( f.replace('filter_by_', '') )
def counts( data, count_by, verbose = False ):
    """
    Counts the occurrences of the feature `count_by` in the dataset `data`.
    Returns the counts as a Counter object and prints them if `verbose` is True.

    The function ``counters.counts_<count_by>`` is looked up and called with
    `data` and `verbose`. Any failure (unknown feature or an error raised
    while counting) is reported by printing the exception and the list of
    supported features; in that case ``None`` is returned.

    :param data: Data entries to be counted.
    :type data: generator or list
    :param count_by: The feature to be used for counting. Can be `author` or `domain`.
    :type count_by: str
    :param verbose: If True, prints the counts. Defaults to False.
    :type verbose: bool

    :Example:

    * ``core.counts(data, count_by = 'author') ## counts distinct authors in data.``
    * ``core.counts(data, count_by = 'domain', verbose = True) ## counts distinct domains in data and print the counts.``
    """
    from helpers import counters

    try:
        counts_helper = getattr( counters, 'counts_' + count_by )
        return counts_helper( data, verbose )
    # `as` form: valid on Python 2.6+ and 3, unlike `except Exception, e`.
    except Exception as e:
        print(repr(e))
        print("Getting counts failed. Supported features to count by:")

        # Every counts_* attribute of the counters module is a supported feature.
        for c in dir( counters ):
            if c.startswith('counts_'):
                print( c.replace('counts_', '') )