Source code for openest.models.features_interpreter

# -*- coding: utf-8 -*-
################################################################################
# Copyright 2014, The Open Aggregator
#   GNU General Public License, Ver. 3 (see docs/license.txt)
################################################################################

"""Probability Features File

The probability features file has the following format::

  dpc1,<p-header-1>,<p-header-2>,...
  <x-value-1>,g_1(y | x_1),g_2(y | x_1),...
  <x-value-2>,g_1(y | x_2),g_2(y | x_2),...
  ...

``<p-header>`` headers can be any of the following, with the
corresponding values in their rows (``<p-value-ij>``).

  * ``mean``: E y|x_i
  * ``var``: E (y|x_i - E y|x_i)^2
  * ``sdev``: \sqrt{E (y|x_i - E y|x_i)^2}
  * ``skew``: E ((y|x_i - E y|x_i) / sqrt{E (y|x_i - E y|x_i)^2})^3
  * ``mode``: \max f(y | x_i)
  * numeric (0 - 1): F^{-1}(p_j|x_i)

The row headers (``<x-value>``) can be numeric, in which case a
continuous spline bridges them, or categorical strings.

Below is a sample features file::

  dpc1,mean,var
  treated,0,1
  control,4,4
"""
__copyright__ = "Copyright 2014, The Open Aggregator"
__license__ = "GPL"

__author__ = "James Rising"
__credits__ = ["James Rising", "Solomon Hsiang", "Bob Kopp"]
__maintainer__ = "James Rising"
__email__ = "jar2234@columbia.edu"

__status__ = "Production"
__version__ = "$Revision$"
# $Source$

import csv, math, random, copy
import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline
from scipy.stats import norm, expon
from scipy.optimize import brentq, minimize

from spline_model import SplineModel, SplineModelConditional

[docs]class FeaturesInterpreter:
[docs]    @staticmethod
    def init_from_feature_file(spline, file, delimiter, limits, status_callback=None):
        reader = csv.reader(file, delimiter=delimiter)
        header = reader.next()
        if header[0] != "dpc1":
            raise ValueError("Unknown format for %s" % (header[0]))

        spline.scaled = True

        while header[-1] == '':
            header = header[:-1]

        # Convert header p-values into numerics
        for ii in range(1, len(header)):
            try:
                val = float(header[ii])
                header[ii] = val
            except ValueError:
                pass

        spline.xx = []
        spline.xx_text = []
        spline.xx_is_categorical = False
        spline.conditionals = []

        last_row = None
        last_conditional = None
        for row in reader:
            if last_row and last_row[1:] == row[1:]:
                conditional = last_conditional.copy()
            else:
                conditional = FeaturesInterpreter.make_conditional_respecting(header, row, limits)
                last_row = row
                last_conditional = conditional

            spline.add_conditional(row[0], conditional)

            if status_callback:
                status_callback("Parsing...", reader.line_num / (reader.line_num + 3.0))

        return spline

    # limits is tuple of (low, high)
[docs]    @staticmethod
    def features_to_gaussian(header, row, limits):
        # Does this look like a mean-variance feature file?
        if len(header) == 3:
            mean = None
            if 'mean' in header:
                mean = float(row[header.index('mean')])
            if 'mode' in header:
                mean = float(row[header.index('mode')])
            if .5 in header:
                mean = float(row[header.index(.5)])
            if mean is None:
                return None

            if 'var' in header:
                var = float(row[header.index('var')])
            elif 'sdev' in header:
                var = float(row[header.index('sdev')]) * float(row[header.index('sdev')])
            else:
                return None

            if np.isnan(var) or var == 0:
                return SplineModelConditional.make_single(mean, mean, [])

            # This might be uniform
            if mean - 2*var < limits[0] or mean + 2*var > limits[1]:
                return None

            return SplineModelConditional.make_gaussian(limits[0], limits[1], mean, var)
        elif len(header) == 4:
            # Does this look like a mean and evenly spaced p-values?
            header = header[1:] # Make a copy of the list
            row = row[1:]
            mean = None
            if 'mean' in header:
                mean = float(row.pop(header.index('mean')))
                header.remove('mean')

            elif 'mode' in header:
                mean = float(row.pop(header.index('mode')))
                header.remove('mode')
            elif .5 in header:
                mean = float(row.pop(header.index(.5)))
                header.remove(.5)
            else:
                return None

            # Check that the two other values are evenly spaced p-values
            row = map(float, row[0:2])
            if np.all(np.isnan(row)):
                return SplineModelConditional.make_single(mean, mean, [])

            if header[1] == 1 - header[0] and abs(row[1] - mean - (mean - row[0])) < abs(row[1] - row[0]) / 100.0:
                print "HERE"
                lowp = min(header)
                lowv = np.array(row)[np.array(header) == lowp][0]

                if lowv == mean:
                    return SplineModelConditional.make_single(mean, mean, [])

                lowerbound = 1e-4 * (mean - lowv)
                upperbound = np.sqrt((mean - lowv) / lowp)

                sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp, lowerbound, upperbound)
                if float(limits[0]) < mean - 3*sdev and float(limits[1]) > mean + 3*sdev:
                    return SplineModelConditional.make_gaussian(limits[0], limits[1], mean, sdev*sdev)
                else:
                    return None
            else:
                # Heuristic best curve: known tails, fit to mean
                lowp = min(header)
                lowv = np.array(row)[np.array(header) == lowp][0]

                lowerbound = 1e-4 * (mean - lowv)
                upperbound = np.log((mean - lowv) / lowp)

                low_sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp, lowerbound, upperbound)
                if float(limits[0]) > mean - 3*low_sdev:
                    return None

                low_segment = SplineModelConditional.make_gaussian(float(limits[0]), lowv, mean, low_sdev*low_sdev)

                highp = max(header)
                highv = np.array(row)[np.array(header) == highp][0]

                lowerbound = 1e-4 * (highv - mean)
                upperbound = np.log((highv - mean) / (1 - highp))

                high_scale = brentq(lambda scale: .5 + expon.cdf(highv, mean, scale) / 2 - highp, lowerbound, upperbound)
                if float(limits[1]) < mean + 3*high_scale:
                    return None

                # Construct exponential, starting at mean, with full cdf of .5
                high_segment = SplineModelConditional.make_single(highv, float(limits[1]), [np.log(1/high_scale) + np.log(.5) + mean / high_scale, -1 / high_scale])

                sevenys = np.linspace(lowv, highv, 7)
                ys = np.append(sevenys[0:2], [mean, sevenys[-2], sevenys[-1]])

                lps0 = norm.logpdf(ys[0:2], mean, low_sdev)
                lps1 = expon.logpdf([ys[-2], ys[-1]], mean, high_scale) + np.log(.5)

                #bounds = [norm.logpdf(mean, mean, low_sdev), norm.logpdf(mean, mean, high_sdev)]

                result = minimize(lambda lpmean: FeaturesInterpreter.skew_gaussian_evaluate(ys, np.append(np.append(lps0, [lpmean]), lps1), low_segment, high_segment, mean, lowp, highp), .5, method='Nelder-Mead')
                print "Skew Gaussian"
                print np.append(np.append(lps0, result.x), lps1)
                return FeaturesInterpreter.skew_gaussian_construct(ys, np.append(np.append(lps0, result.x), lps1), low_segment, high_segment)

[docs]    @staticmethod
    def skew_gaussian_construct(ys, lps, low_segment, high_segment):
        mid_segment = SplineModelConditional.make_conditional_from_spline(InterpolatedUnivariateSpline(ys, lps, k=2), (low_segment.y1s[0], high_segment.y0s[0]))
        conditional = SplineModelConditional()
        conditional.add_segment(low_segment.y0s[0], low_segment.y1s[0], copy.copy(low_segment.coeffs[0]))
        for ii in range(mid_segment.size()):
            conditional.add_segment(mid_segment.y0s[ii], mid_segment.y1s[ii], mid_segment.coeffs[ii])
        conditional.add_segment(high_segment.y0s[0], high_segment.y1s[0], copy.copy(high_segment.coeffs[0]))

        try:
            conditional.rescale()
        except:
            return None

        return conditional

[docs]    @staticmethod
    def skew_gaussian_evaluate(ys, lps, low_segment, high_segment, mean, lowp, highp):
        conditional = FeaturesInterpreter.skew_gaussian_construct(ys, lps, low_segment, high_segment)
        if conditional is None:
            return SplineModel.posinf

        error = 0
        # Discontinuities:
        error += np.square(conditional.evaluate(0, low_segment.y1s[0]) - conditional.evaluate(1, low_segment.y1s[0]))
        error += np.square(conditional.evaluate(conditional.size() - 2, high_segment.y0s[0]) - conditional.evaluate(conditional.size() - 1, high_segment.y0s[0]))

        # Mean:
        error += np.square(mean - conditional.approximate_mean((low_segment.y0s[0], high_segment.y1s[0])))

        # lps
        error += np.square(conditional.cdf(low_segment.y1s[0]) - lowp)
        error += np.square(conditional.cdf(high_segment.y0s[0]) - highp)

        return error

[docs]    @staticmethod
    def features_to_exponential(header, row, limits):
        if len(header) != 2:
            return None

        if 'mean' not in header:
            return None

        mean = float(row[header.index('mean')])

        # Is it one-sided?
        if mean > limits[0] and limits[0] + (mean - limits[0]) * 3 < limits[1]:
            # positive exponential
            return SplineModelConditional.make_single(limits[0], limits[1], [limits[0] / (mean - limits[0]), -1/(mean - limits[0])]).rescale()
        if mean < limits[1] or limits[1] - (limits[1] - mean) * 3 > limits[0]:
            # negative exponential
            return SplineModelConditional.make_single(limits[0], limits[1], [-limits[1] / (limits[1] - mean), 1/(limits[1] - mean)]).rescale()
        else:
            return None

[docs]    @staticmethod
    def features_to_uniform(header, row, limits):
        if len(header) != 1:
            return None

        return SplineModelConditional.make_single(limits[0], limits[1], [1/(limits[1] - limits[0])])

    # Only for scaled distributions
[docs]    @staticmethod
    def make_conditional_respecting(header, row, limits):
        low = high = None
        if 0 in header:
            low = float(row[header.index(0)])
            header.remove(header.index(0))
        if 1 in header:
            high = float(row[header.index(1)])
            header.remove(header.index(1))

        if low is not None and high is not None:
            model = FeaturesInterpreter.make_conditional(header, row, (low, high))
            model.add_segment(limits[0], low, [SplineModel.neginf])
            model.add_segment(high, limits[1], [SplineModel.neginf])

            return model

        if low is not None:
            model = FeaturesInterpreter.make_conditional(header, row, (low, limits[1]))
            model.add_segment(limits[0], low, [SplineModel.neginf])

            return model

        if high is not None:
            model = FeaturesInterpreter.make_conditional(header, row, (limits[0], high))
            model.add_segment(high, limits[1], [SplineModel.neginf])

            return model

        return FeaturesInterpreter.make_conditional(header, row, limits)

    # Only for scaled distributions
[docs]    @staticmethod
    def make_conditional(header, row, limits):
        # Look for a special case
        conditional = FeaturesInterpreter.features_to_gaussian(header, row, limits)
        if conditional is not None:
            return conditional

        conditional = FeaturesInterpreter.features_to_exponential(header, row, limits)
        if conditional is not None:
            return conditional

        conditional = FeaturesInterpreter.features_to_uniform(header, row, limits)
        if conditional is not None:
            return conditional

        spline = FeaturesInterpreter.best_spline(header, row, limits)
        conditional = SplineModelConditional.make_conditional_from_spline(spline, limits)
        return conditional.rescale()

[docs]    @staticmethod
    def best_knot(knots, newknots):
        """Find the knot furthest from existing knots"""
        if len(knots) == 0:
            return newknots[0]

        scores = np.zeros(len(newknots))
        for ii in range(len(newknots)):
            scores[ii] = min(abs(np.array(knots) - newknots[0]))
        return newknots[np.argmax(scores)]

[docs]    @staticmethod
    def best_spline(header, row, limits):
        print "Best Spline"
        # Do general solution
        best_ys = []
        best_lps = []
        for ii in range(1, len(header)):
            if isinstance(header[ii], float):
                best_ys.append(float(row[ii]))
                best_lps.append(-6*abs(.5 - header[ii]))
            elif header[ii] == 'mode':
                best_ys.append(float(row[ii]))
                best_lps.append(0)
            elif header[ii] == 'var':
                if 'mean' in header:
                    best_ys.append(FeaturesInterpreter.best_knot(best_ys, [float(row[header.index('mean')]) + math.sqrt(float(row[ii])),
                                                       float(row[header.index('mean')]) - math.sqrt(float(row[ii]))]))
                else:
                    best_ys.append(FeaturesInterpreter.best_knot(best_ys, [math.sqrt(float(row[ii])),
                                                       -math.sqrt(float(row[ii]))]))
                best_lps.append(-1)
            elif header[ii] == 'skew':
                if 'var' in header and 'mean' in header:
                    best_ys.append(FeaturesInterpreter.best_knot(best_ys, [float(row[header.index('mean')]) +
                                                       math.sqrt(float(row[header.index('var')])) * math.pow(float(row[ii]), 1.0/3)]))
                else:
                    best_ys.append(FeaturesInterpreter.best_knot(best_ys, [math.pow(float(row[ii]), 1.0/3)]))
                best_lps.append(-1.5)
            elif header[ii] == 'mean' and 'var' not in header and 'skew' not in header:
                best_ys.append(FeaturesInterpreter.best_knot(best_ys, [float(row[ii])]))
                best_lps.append(0)

        indexes = [elt[0] for elt in sorted(enumerate(best_ys), key=lambda elt: elt[1])]
        best_ys = [best_ys[index] for index in indexes]
        best_lps = [best_lps[index] for index in indexes]

        best_spline = InterpolatedUnivariateSpline(best_ys, best_lps, k=2)
        best_error = FeaturesInterpreter.evaluate_spline(header, row, best_spline, limits)

        # Search space for spline that fits criteria
        print "Searching..."
        for attempt in range(100):
            ys = best_ys + np.random.normal(0, max(best_ys) - min(best_ys), len(best_ys))
            lps = best_lps + np.random.normal(0, max(best_lps) - min(best_lps) + 1, len(best_lps))
            ys = ys.tolist()
            lps = lps.tolist()

            # I use 1 - random() here because random() range is [0, 1).
            if limits[0] == SplineModel.neginf:
                ys.insert(0, ys[0] - 1/(1 - random.random()))
                lps.insert(0, -7)
            elif limits[0] < ys[0]:
                ys.insert(0, limits[0])
                lps.insert(0, -6 - 1/(1 - random.random()))

            if limits[1] == SplineModel.posinf:
                ys.insert(0, ys[-1] + 1/(1 - random.random()))
                lps.insert(0, -7)
            elif limits[1] > ys[-1]:
                ys.insert(0, limits[1])
                lps.insert(0, -6 - 1/(1 - random.random()))

            spline = InterpolatedUnivariateSpline(ys, lps, k=2)
            error = FeaturesInterpreter.evaluate_spline(header, row, spline, limits)

            if error < best_error:
                best_spline = spline
                best_error = error

        return best_spline

[docs]    @staticmethod
    def evaluate_spline(header, row, spline, limits):
        limits = (max(min(spline.get_knots()), float(limits[0])), min(max(spline.get_knots()), float(limits[1])))

        ys = np.linspace(limits[0], limits[1], len(header) * SplineModel.samples)
        ps = np.exp(spline(ys)) * (limits[1] - limits[0]) / (len(header) * SplineModel.samples)
        ps = ps / sum(ps)
        cfs = np.cumsum(ps)

        if 'mean' in header or 'var' in header or 'skew' in header:
            mean = sum(ps * ys)
        if 'var' in header or 'skew' in header:
            var = sum(ps * np.square(ys - mean))

        error = 0
        for ii in range(1, len(header)):
            if isinstance(header[ii], float):
                error = error + np.abs(SplineModelConditional.find_nearest(cfs, header[ii], ys) - float(row[ii]))
            elif header[ii] == 'mean':
                error = error + np.abs(mean - float(row[ii]))
            elif header[ii] == 'mode':
                mode = ys[ps.argmax()]
                error = error + np.abs(mode - float(row[ii]))
            elif header[ii] == 'var':
                error = error + np.sqrt(np.abs(var - float(row[ii])))
            elif header[ii] == 'skew':
                skew = sum(ps * np.pow((ys - mean) / sqrt(var), 3))
                error = error + np.pow(np.abs(skew - float(row[ii])), 1.0/3)

        return error