#! /usr/bin/env python
#
# Copyright (C) 2019 Data Engines Corporation.
#
# Author: Dr. Andres Corrada-Emmanuel
#
#
# This code computes the deltas between regressors needed to
# estimate their relative regressor bias.
#
# Input: A Python CSV Reader object for the aligned regressor estimates.
# Each row in your CSV should represent a point in your dataset.
# Each column is the estimate of one of the regressors in
# your ensemble of regressors.
# Output: The average difference for every possible pair of regressors,
# returned as a list of formatted strings (the script prints one
# value per line). Join the values with commas into one long
# string to submit them to the online app, and you'll get a
# report on the L1-minimizing relative bias between your regressors.
#
# Sanity Check: If your CSV has the estimates of n regressors,
# the output file should have n*(n-1)/2 entries.
# WARNING: Do not change the order of the output deltas. The
# online app assumes that they are in the same order
# as the i,j for loops used in the code.
def calculate_pair_deltas(csv_rdr):
    """Return the mean delta between every distinct pair of regressors.

    Args:
        csv_rdr: An iterator of rows (e.g. a ``csv.reader``). Each row is one
            data point and each column is the estimate of one regressor.

    Returns:
        A list of strings, one per pair ``(i, j)`` with ``i < j``, holding the
        mean of ``column[i] - column[j]`` over all rows. For ``n`` regressors
        the list has ``n*(n-1)/2`` entries, ordered as the nested ``i``, ``j``
        loops would produce them. An empty input yields an empty list.
    """
    # Pop the first row to establish the number of regressors. A default is
    # supplied so that an empty input does not leak StopIteration to callers.
    first_row = next(csv_rdr, None)
    if first_row is None:
        # No rows means no regressors and therefore no pairs to report.
        return []

    # How many regressors are there?
    n = len(first_row)

    # Map each index pair to [running sum, rows seen]. This is a streaming
    # way of computing the mean of a possibly very large dataset: only one
    # small accumulator per regressor pair is kept, and the dataset size is
    # discarded once the means are computed.
    #
    # j starts at i + 1: a regressor's delta with itself is always zero, and
    # including it would produce n*(n+1)/2 entries instead of the documented
    # n*(n-1)/2.
    deltas_stats = {
        (i, j): [0.0, 0]
        for i in range(n)
        for j in range(i + 1, n)
    }

    deltas_stats = process_row(first_row, deltas_stats)
    for row in csv_rdr:
        deltas_stats = process_row(row, deltas_stats)

    return deltas_from_deltas_stats(deltas_stats)
def process_row(row, deltas):
    """Fold one row of regressor estimates into the running pair statistics.

    Args:
        row: Sequence of values convertible with ``float()``, one per
            regressor.
        deltas: Dict mapping index pairs ``(i, j)`` to a mutable
            ``[running_sum, count]`` list. It is updated in place.

    Returns:
        The same ``deltas`` dict that was passed in.

    Raises:
        ValueError: If a value in ``row`` cannot be converted to float.
        IndexError: If ``row`` is shorter than an index used by a pair.
    """
    # Convert the whole row up front so a malformed value fails before any
    # accumulator has been touched.
    estimates = list(map(float, row))
    for pair, running in deltas.items():
        # Accumulate the signed difference (first minus second) for the pair.
        running[0] += estimates[pair[0]] - estimates[pair[1]]
        running[1] += 1
    return deltas
def deltas_from_deltas_stats(deltas_stats):
    """Turn the running pair statistics into formatted mean deltas.

    Args:
        deltas_stats: Dict mapping index pairs to ``(sum_of_deltas, count)``.

    Returns:
        A list of strings, one per pair in sorted key order, each the mean
        delta rendered with the ``'{:g}'`` format.

    Raises:
        ZeroDivisionError: If a pair's count is zero.
    """
    render = '{:g}'.format
    formatted = []
    for key in sorted(deltas_stats):
        total, count = deltas_stats[key]
        # Only the mean is kept; the number of rows seen is discarded here,
        # so the dataset size is not part of the output.
        formatted.append(render(total / count))
    return formatted
if __name__ == "__main__":
    # Script entry point: read the file named by the first command-line
    # argument and print one mean pair delta per line.
    import csv
    import sys

    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: {} <estimates-file>".format(sys.argv[0]))

    # The context manager guarantees the file is closed even if parsing
    # fails part-way through.
    with open(sys.argv[1]) as fp:
        # Columns are split on tabs, i.e. the input is tab-separated.
        csv_reader = csv.reader(fp, delimiter="\t")
        deltas = calculate_pair_deltas(csv_reader)

    for delta in deltas:
        print(delta)