# -*- coding: utf-8 -*-
################################################################################
# Copyright 2014, The Open Aggregator
# GNU General Public License, Ver. 3 (see docs/license.txt)
################################################################################
"""Mean-Size Model
In Mean-Size models, each point is characterized only by a value and the population size that went into estimating that value. As such, it does not have enough information to generate a full distribution. It can be safely combined with other mean-size models, or approximated with a Gaussian (with a variance which is equal to the absolute value of the mean for size = 1, and a variance that decreases with the square root of the size, according to the Central Limit Theorem).
The format is::
msx1,mean,size
<x0>,<mean0>,<size0>
<x1>,<mean1>,<size1>
...
"""
__copyright__ = "Copyright 2014, The Open Aggregator"
__license__ = "GPL"
__author__ = "James Rising"
__credits__ = ["James Rising", "Amir Jina"]
__maintainer__ = "James Rising"
__email__ = "jar2234@columbia.edu"
__status__ = "Production"
__version__ = "$Revision$"
# $Source$
import math, csv
from scipy.interpolate import interp1d
import numpy as np
from model import Model, Attribute
from univariate_model import UnivariateModel
[docs]class MeanSizeModel(UnivariateModel):
def __init__(self, xx_is_categorical=False, xx=None, means=None, sizes=None):
super(MeanSizeModel, self).__init__(xx_is_categorical, xx, True)
self.means = means
self.sizes = sizes
[docs] def kind(self):
return 'mean_size_model'
[docs] def copy(self):
return MeanSizeModel(self.xx_is_categorical, list(self.get_xx()), list(self.means), list(self.sizes))
[docs] def scale_y(self, a):
self.means = map(lambda m: m * a, self.means)
return self
[docs] def scale_p(self, a):
self.sizes = map(lambda s: s * a, self.sizes)
return self
[docs] def get_mean(self, x=None):
return self.means[self.get_xx().index(x)]
[docs] def get_sdev(self, x=None):
index = self.get_xx().index(x)
return abs(self.means[index]) / math.sqrt(self.sizes[index])
[docs] def filter_x(self, xx):
return MeanSizeModel(self.xx_is_categorical, xx, map(lambda x: self.means[xx.index(x)], xx), map(lambda x: self.sizes[xx.index(x)], xx))
[docs] def interpolate_x(self, newxx, kind='quadratic'):
fx = interp1d(self.xx, self.means, kind)
means = fx(newxx)
fx = interp1d(self.xx, self.sizes, kind)
sizes = fx(newxx)
return MeanSizeModel(self.xx_is_categorical, newxx, means, sizes)
[docs] def attribute_list(self):
return ["sizes"]
[docs] def get_attribute(self, title):
if title == "sizes":
if len(self.sizes) == 1:
return Attribute("Sample Sizes", None, None, None, self.sizes, None, None)
else:
return [Attribute("Sample Sizes", None, None, self.get_xx()[ii], self.sizes[ii], None, None) for ii in range(len(self.sizes))]
raise title + " not available"
[docs] def write_file(self, filename, delimiter):
with open(filename, 'w') as fp:
self.write(fp, delimiter)
[docs] def write(self, file, delimiter):
file.write("msx1" + "\n")
for ii in range(len(self.xx)):
file.write(delimiter.join(map(str, [self.get_xx()[ii], self.means[ii], self.sizes[ii]])) + "\n")
[docs] def init_from_mean_size_file(self, file, delimiter, status_callback=None):
reader = csv.reader(file, delimiter=delimiter)
header = reader.next()
if header[0] != "msx1":
raise ValueError("Unknown format: %s" % (fields[0]))
xx_text = []
xx = []
means = []
sizes = []
self.xx_is_categorical = False
for row in reader:
xx_text.append(row[0])
try:
xx.append(float(row[0]))
except ValueError:
xx.append(len(xx))
self.xx_is_categorical = True
means.append(float(row[1]))
sizes.append(float(row[2]))
self.xx = xx
self.xx_text = xx_text
self.means = means
self.sizes = sizes
[docs] @staticmethod
def merge(models, treatment="default"):
if treatment == 'default':
if 'treated' in models[0].get_xx() and 'control' in models[0].get_xx():
treatment = 'treated-even'
else:
treatment = 'independent'
if treatment == "independent":
# Collect the union of all x values
masterxx = set()
for model in models:
masterxx.update(model.get_xx())
masterxx = list(masterxx)
means = []
sizes = []
for ii in range(len(masterxx)):
print masterxx[ii]
numersum = 0
denomsum = 0
for model in models:
xx = model.get_xx()
try:
jj = xx.index(masterxx[ii])
numersum += model.means[jj] * model.sizes[jj]
denomsum += model.sizes[jj]
except:
pass
print numersum, denomsum
means.append(numersum / float(denomsum))
sizes.append(denomsum)
return MeanSizeModel(models[0].xx_is_categorical, masterxx, means, sizes)
else:
# All need to have the same x-values
numersum = 0
denomsum = 0
for model in models:
ii = model.get_xx().index('treated')
jj = model.get_xx().index('control')
if treatment == 'treated-even':
avgsize = 1
else:
avgsize = (model.sizes[ii] + model.sizes[jj]) / 2
numersum += (model.means[ii] - model.means[jj]) * avgsize
denomsum += avgsize
means = [numersum / float(denomsum)]
sizes = [denomsum]
return MeanSizeModel(models[0].xx_is_categorical, ["difference"], means, sizes)
[docs] @staticmethod
def combine(one, two):
if one.xx_is_categorical != two.xx_is_categorical:
raise ValueError("Cannot combine models that do not agree on categoricity")
(one, two, xx) = UnivariateModel.intersect_x(one, two)
means = np.array(one.means) + np.array(two.means)
sizes = 1 / (1 / np.sqrt(np.array(one.sizes)) + 1 / np.sqrt(np.array(two.means)))**2
return MeanSizeModel(one.xx_is_categorical, xx, means, sizes)
Model.mergers["mean_size_model"] = MeanSizeModel.merge
Model.combiners['mean_size_model+mean_size_model'] = MeanSizeModel.combine