Commit 9968f8a6 authored by Laurel Orr

Really Everything now

parent 3f6b02b5
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
import os
import sys
import pickle
import itertools
import re
import psycopg2
import argparse
import datetime
import time
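# parse_index turns an underscore-separated string of binary masks into a 2-D
# boolean array: e.g. "011_101" becomes [[False, True, True], [True, False, True]],
# one row per test index.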
def parse_index(index_string):
return np.array(list(map(lambda x: list(map(lambda y: y == "1", x)), index_string.split("_"))))
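# connect_db opens a psycopg2 connection with SERIALIZABLE isolation and
# autocommit disabled; note that the user name and password are hard-coded.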
def connect_db(database, host):
conn = psycopg2.connect(database=database, user = "ljorr1", password = "password", host = host, port = "5432")
conn.set_session(isolation_level="SERIALIZABLE", autocommit=False)
return conn
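# index_to_numeric selects, for each boolean mask, the matching entries of
# column_index (the 1-based ordinal positions supplied via --tci).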
def index_to_numeric(index_set, column_index):
numeric_index = []
for idx in index_set:
# column_index values are already 1-based, matching R's indexing
numeric_index.append(column_index[idx])
return(np.array(numeric_index))
def index_to_attr(index_set, column_index, columns):
attr_index = []
for idx in index_set:
# -1 converts the 1-based (R-style) position back to a 0-based Python index
attr_index.append(columns[column_index[idx]-1])
return(np.array(attr_index))
def index_to_bool(index_set, column_index):
boolean_index = []
for idx in index_set:
temp = np.array([False]*len(column_index))
temp[np.where(column_index == idx)[0][0]] = True
boolean_index.append(temp)
return(np.array(boolean_index))
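# get_columns looks up information_schema to recover the column names whose
# ordinal positions appear in column_index for the given histogram table.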
def get_columns(cur, query_hist_table, column_index):
cur.execute("SELECT ordinal_position, column_name FROM information_schema.columns WHERE table_name='{:s}';".format( query_hist_table))
res = cur.fetchall()
columns = [t[1] for t in res if t[0] in column_index]
return columns
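# point_query sums the sample weights of the rows that match the given
# attribute values exactly; the values are interpolated into the WHERE clause
# as text and are assumed to be escaped already.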
def point_query(cur, attrs, tup, table):
where_cond = ["{:s}::text = '{:s}'".format(a, tup[a]) for a in attrs]
q = "SELECT SUM(weight) AS cnt FROM {:s} WHERE {:s}".format(table, " AND ".join(where_cond))
cur.execute(q)
ans = cur.fetchall()
return ans
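# execute_sample_topk runs the top-k GROUP BY over the weighted sample table,
# ordering groups by total weight and breaking ties on the attribute values.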
def execute_sample_topk(cur, conn, relation, attrs, sample, limit):
sorted_attrs = sorted(attrs)
selection = map(lambda x: "replace(({0:s})::text,'''','''''') AS {0:s}".format(x), sorted_attrs)
outer_join_selection = map(lambda x: "replace(t_{0:s}.{0:s}::text,'''','''''') AS {0:s}".format(x), sorted_attrs)
order_by_at_end = map(lambda x: "'{0:s}.' || replace(({0:s})::text,'''','''''') ASC".format(x), sorted_attrs)
outer_join_order_by_at_end = map(lambda x: "'{0:s}.' || replace(t_{0:s}.{0:s}::text,'''','''''') ASC".format(x), sorted_attrs)
print("EXECUTING...")
query_str = "SELECT {:s}, 1.0*SUM(weight) AS weight FROM {:s} GROUP BY {:s} ORDER BY SUM(weight) DESC, {:s} LIMIT {:d};".format(", ".join(selection), sample, ", ".join(attrs), ", ".join(order_by_at_end), limit)
print(query_str)
cur.execute(query_str)
conn.commit()
tups = cur.fetchall()
return(tups)
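# execute_true_topk is the same top-k query run against the true histogram
# table (its cnt column) instead of the sample, and serves as ground truth.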
def execute_true_topk(cur, conn, relation, attrs, query_hist_table, limit):
sorted_attrs = sorted(attrs)
selection = map(lambda x: "replace(({0:s})::text,'''','''''') AS {0:s}".format(x), sorted_attrs)
outer_join_selection = map(lambda x: "replace(t_{0:s}.{0:s}::text,'''','''''') AS {0:s}".format(x), sorted_attrs)
order_by_at_end = map(lambda x: "'{0:s}.' || replace(({0:s})::text,'''','''''') ASC".format(x), sorted_attrs)
outer_join_order_by_at_end = map(lambda x: "'{0:s}.' || replace(t_{0:s}.{0:s}::text,'''','''''') ASC".format(x), sorted_attrs)
print("EXECUTING...")
query_str = "SELECT {:s}, 1.0*SUM(cnt) AS cnt FROM {:s} GROUP BY {:s} ORDER BY SUM(cnt) DESC, {:s} LIMIT {:d};".format(", ".join(selection), query_hist_table, ", ".join(attrs), ", ".join(order_by_at_end), limit)
print(query_str)
cur.execute(query_str)
conn.commit()
tups = cur.fetchall()
return(tups)
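# generate_test_tups picks the test tuples for one query order:
#   ASC/DESC - groups with the lowest/highest true counts
#   RAND     - a seeded random sample of groups
#   NONEXIST - attribute combinations absent from the histogram, taken either
#              from a precomputed <relation>_nonexistall table (special_q) or
#              from a cross product of the per-attribute constants tables.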
def generate_test_tups(cur, conn, relation, attrs, q_order, query_hist_table, limit, special_q):
sorted_attrs = sorted(attrs)
selection = map(lambda x: "replace(({0:s})::text,'''','''''') AS {0:s}".format(x), sorted_attrs)
outer_join_selection = map(lambda x: "replace(t_{0:s}.{0:s}::text,'''','''''') AS {0:s}".format(x), sorted_attrs)
order_by_at_end = map(lambda x: "'{0:s}.' || replace(({0:s})::text,'''','''''') ASC".format(x), sorted_attrs)
outer_join_order_by_at_end = map(lambda x: "'{0:s}.' || replace(t_{0:s}.{0:s}::text,'''','''''') ASC".format(x), sorted_attrs)
if (q_order == "ASC"):
print("EXECUTING...")
query_str = "SELECT {:s}, 1.0*SUM(cnt) AS cnt FROM {:s} GROUP BY {:s} ORDER BY SUM(cnt) ASC, {:s} LIMIT {:d};".format(", ".join(selection), query_hist_table, ", ".join(attrs), ", ".join(order_by_at_end), limit)
print(query_str)
cur.execute(query_str)
conn.commit()
tups = cur.fetchall()
elif (q_order == "DESC"):
print("EXECUTING...")
query_str = "SELECT {:s}, 1.0*SUM(cnt) AS cnt FROM {:s} GROUP BY {:s} ORDER BY SUM(cnt) DESC, {:s} LIMIT {:d};".format(", ".join(selection), query_hist_table, ", ".join(attrs), ", ".join(order_by_at_end), limit)
print(query_str)
cur.execute(query_str)
conn.commit()
tups = cur.fetchall()
elif (q_order == "RAND"):
print("EXECUTING (SETTING SEED)...")
query_str = "SELECT {:s}, 1.0*SUM(cnt) AS cnt FROM {:s} GROUP BY {:s} ORDER BY random() LIMIT {:d};".format(", ".join(selection), query_hist_table, ", ".join(attrs), limit)
print(query_str)
cur.execute("SELECT setseed(0.5);")
cur.execute(query_str)
conn.commit()
tups = cur.fetchall()
elif (q_order == "NONEXIST"):
print("EXECUTING (SETTING SEED)...")
if special_q:
query_str = "SELECT {0:s}, cnt FROM {1:s}_nonexistall ORDER BY random() LIMIT 100;".format(", ".join(selection), relation)
else:
cross_prod = ["(SELECT {0:s} FROM {1:s}_bin_constants_{0:s}) as t_{0:s}".format(attr, relation) for attr in attrs]
outer_join = ["t_{0:s}.{0:s} = {1:s}.{0:s}".format(attr, query_hist_table) for attr in attrs]
where = ["{:s}.{:s} IS NULL".format(query_hist_table, attr) for attr in attrs]
query_str = "SELECT * FROM (SELECT {0:s}, 0.0 as cnt FROM {1:s} LEFT OUTER JOIN (SELECT DISTINCT {2:s} FROM {3:s}) as {3:s} ON ({4:s}) WHERE {5:s} ORDER BY {6:s}) as temp ORDER BY random() LIMIT {7:d}".format(", ".join(outer_join_selection), " CROSS JOIN ".join(cross_prod), ", ".join(attrs), query_hist_table, " AND ".join(outer_join), " OR ".join(where), ", ".join(outer_join_order_by_at_end), limit)
print(query_str)
# These should be in the same order for all methods because it's the same query
cur.execute("SELECT setseed(0.5);")
cur.execute(query_str)
conn.commit()
tups = cur.fetchall()
else:
print("Query Order ", q_order, " is not recognized. Try again from ASC, DESC, or NONEXIST.")
sys.exit(0)
return(tups)
def main(args):
parser = argparse.ArgumentParser(description='Use linear regression to de-bias a sample.')
parser.add_argument("--host", type=str, default="",
help = "What ip address db running on or string localhost")
parser.add_argument("--id", type=str, default="",
help = "Adds id to output file paths for differentiation")
parser.add_argument("--data_path", type=str, default="",
help = "Path without trailing toupper(relation), where data is located.")
parser.add_argument("--plot_path", type=str, default="",
help = "Path without trailing toupper(relation), where data is located.")
parser.add_argument("--global_itr", type=int, default = 0,
help = "An iteration number in case you want multiple iterations of same run")
parser.add_argument("--db", type=str, default="",
help = "Use given database")
parser.add_argument("--relation", type=str, default="",
help = "Use given relation/Bayesian network name. Must have <relation>_rand_hist in database.")
parser.add_argument("--sample_type", type=str, default="",
help = "Sample type to use (unif, bias)")
parser.add_argument("--sample_percent", type=float, default = 0,
help = "Sample percentage)")
parser.add_argument("--test_selection_size", type=int, default = 0,
help = "the number of tuples to choose for each order for testing")
parser.add_argument("--tci", type=str, default="",
help = "list of test column indexes (starting at 1, separated by ',' or '_' for range) to include in summary (default is all)")
parser.add_argument("--ti", type=str, default="",
help = "list of test indexes to use for tests from column indexes (in binary), separated by '_' (ex: 011_101)")
parser.add_argument("--topkti", type=str, default="",
help = "list of test indexes to use for tests from column indexes (in binary), separated by '_' (ex: 011_101)")
print("Received arguments", args)
opt = parser.parse_args(args)
if (opt.host == "" or opt.global_itr == 0 or opt.db == "" or opt.relation == "" or opt.sample_type == "" or opt.sample_percent == 0 or opt.test_selection_size == 0 or (opt.ti == "" and opt.topkti == "")):
print("You had an option be empty! Make sure all of these are not empty or 0 or -1.")
print("opt.host", opt.host)
print("opt.global_itr", opt.global_itr)
print("opt.db", opt.db)
print("opt.relation", opt.relation)
print("opt.sample_type", opt.sample_type)
print("opt.sample_percent", opt.sample_percent)
print("opt.test_selection_size", opt.test_selection_size)
print("opt.tci", opt.ci)
print("opt.ti and opt.topkti", opt.ti, opt.topkti)
sys.exit(0)
database = opt.db
host = opt.host
print("Host", host)
print("DB", database)
conn = connect_db(database, host)
cur = conn.cursor()
limit = opt.test_selection_size
relation = opt.relation
column_index = np.array(list(map(lambda x: int(x), opt.tci.split(","))))
num_nodes = len(column_index)
query_hist_table = relation + "_rand_hist"
columns = np.array(get_columns(cur, query_hist_table, column_index))
q_orders = ["ASC", "DESC", "RAND"]
path = opt.data_path
if path[-1] != "/":
path += "/"
path += relation.upper() + "/"
plot_path = opt.plot_path
if plot_path[-1] != "/":
plot_path += "/"
test_index_set = parse_index(opt.ti)
topk_index_set = parse_index(opt.topkti)
sample_type = opt.sample_type
sample_percent = opt.sample_percent
global_itr = opt.global_itr
description = opt.id
ending = "{:s}_{:s}_smp{:s}_perc{:.2f}_itr{:d}".format(relation, description, sample_type, sample_percent, global_itr)
ending_sample = "sample_{:s}_{:s}_perc{:.2f}_itr{:d}".format(relation, sample_type, sample_percent, global_itr)
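# "ending" is the suffix shared by every output CSV of this run, while
# "ending_sample" (with "." replaced by "_") names the sample table queried below.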
# Note: ORDER BY random() run on a different machine will NOT return the same order, even after the seed has been set!
cur.execute("SELECT ordinal_position, column_name FROM information_schema.columns WHERE table_name='{:s}';".format(query_hist_table))
universe_attrs = np.array([t[1] for t in cur.fetchall() if t[0] in column_index])
responses_attrs = ["Id", "Ending", "Join", "QAttrs", "NumQAttrs", "DBRuntime", "Order", "TrueModel", "DBModel"]
##################
### COUNT
##################
responses = pd.DataFrame(columns=responses_attrs)
for test_index in test_index_set:
if len(test_index) == 0:
continue
num_per_gb = 0
special_q = False
# must sort so the join key matches the sort order of BN
test_attrs = sorted(list(universe_attrs[test_index]))
# print("Running query on", test_attrs)
temp_responses_agg = pd.DataFrame(columns=responses_attrs)
for q_order in q_orders:
special_q = False
if (len(test_attrs) == len(test_index)) and q_order == "NONEXIST":
special_q = True
temp_responses = pd.DataFrame(generate_test_tups(cur, conn, relation, test_attrs, q_order, query_hist_table, limit, special_q), columns = test_attrs+["TrueModel"])
if temp_responses.shape[0] == 0:
continue
for i in range(temp_responses.shape[0]):
tup = temp_responses.iloc[i,:]
start_time = time.time()
ans = point_query(cur, test_attrs, tup, ending_sample.replace(".", "_"))
end_time = time.time()
ans = ans[0][0]
if ans is None:
ans = 0
else:
ans = float(ans)
# print("ANS", ans)
temp_responses.loc[i,"DBModel"] = ans
temp_responses.loc[i,"DBRuntime"] = end_time-start_time
temp_responses["Order"] = q_order
temp_responses["QAttrs"] = "_".join(sorted(test_attrs))
temp_responses["NumQAttrs"] = len(test_attrs)
temp_responses["Join"] = temp_responses[test_attrs].apply(lambda x: " & ".join(["({:s} == \'{:s}\')".format(grp, x[grp].replace("\'\'", "")) for grp in test_attrs]), axis=1)
temp_responses.drop(test_attrs, axis=1, inplace=True)
temp_responses_agg = pd.concat([temp_responses_agg, temp_responses], axis=0, sort=True)
responses = pd.concat([responses, temp_responses_agg], axis=0, sort=True)
responses["Ending"] = ending
responses["Id"] = description
if len(test_index_set[0]) > 0:
responses = responses[responses_attrs]
responses.to_csv("{:s}rw_runtime_count_{:s}.csv".format(path, ending), index=False)
##################
### TOP K
##################
responses = pd.DataFrame(columns=responses_attrs)
for test_index in topk_index_set:
if len(test_index) == 0:
continue
num_per_gb = 0
# must sort so the join key matches the sort order of BN
test_attrs = sorted(list(universe_attrs[test_index]))
# print("Running query on", test_attrs)
temp_responses = pd.DataFrame(columns=responses_attrs)
true_responses = pd.DataFrame(execute_true_topk(cur, conn, relation, test_attrs, query_hist_table, limit), columns = test_attrs+["TrueModel"])
start_time = time.time()
db_responses = pd.DataFrame(execute_sample_topk(cur, conn, relation, test_attrs, ending_sample.replace(".", "_"), limit), columns = test_attrs+["DBModel"])
end_time = time.time()
temp_responses["Order"] = "DESC"
temp_responses["QAttrs"] = "_".join(test_attrs)
temp_responses["NumQAttrs"] = len(test_attrs)
temp_responses["TrueModel"] = true_responses[test_attrs].apply(lambda x: " & ".join(["({:s} == \'{:s}\')".format(grp, x[grp]) for grp in test_attrs]), axis=1)
temp_responses["DBModel"] = db_responses[test_attrs].apply(lambda x: " & ".join(["({:s} == \'{:s}\')".format(grp, x[grp]) for grp in test_attrs]), axis=1)
temp_responses["DBRuntime"] = end_time-start_time
temp_responses["Ending"] = ending
temp_responses["Id"] = description
responses = pd.concat([responses, temp_responses], axis=0, sort=True)
if len(topk_index_set[0]) > 0:
responses = responses[responses_attrs]
responses.to_csv("{:s}rw_runtime_topk_{:s}.csv".format(path, ending), index=False)
conn.close()
if __name__ == '__main__':
main(sys.argv[1:])
\ No newline at end of file
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
import os
import sys
import pickle
import itertools
import re
import psycopg2
import argparse
import datetime
from termcolor import colored
tableau20 = [(31, 119, 180), (174, 199, 232),
(44, 160, 44), (152, 223, 138),
(255, 127, 14), (255, 187, 120),
(214, 39, 40), (255, 152, 150),
(148, 103, 189), (197, 176, 213),
(140, 86, 75), (196, 156, 148),
(227, 119, 194), (247, 182, 210),
(127, 127, 127), (199, 199, 199),
(188, 189, 34), (219, 219, 141),
(23, 190, 207), (158, 218, 229)]
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
for i in range(len(tableau20)):
r, g, b = tableau20[i]
tableau20[i] = (r / 255., g / 255., b / 255.)
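# sanitize_and_check_description de-duplicates description_map.csv: for each
# ending it keeps only the most recent entry per format (by timestamp) and
# aborts if two entries for the same ending disagree on the statistic string,
# then rewrites the file in place.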
def sanitize_and_check_description(file):
file_in = open(file, "r")
descrp = {}
for line in file_in:
line = line.strip()
if line == "":
continue
form,k,statistic_string,dateti = line.split(",")
dateti = datetime.datetime.strptime(dateti, "%a %b %d %X %Y")
if k not in descrp:
descrp[k] = {form: [form,k,statistic_string,dateti]}
else:
form_dict2 = descrp[k]
# overwrite with most recent output by timestamp
if form in form_dict2:
[form2,k2,statistic_string2,dateti2] = form_dict2[form]
if dateti2 < dateti:
descrp[k][form] = [form,k,statistic_string,dateti]
# make sure the stat strings are the same
else:
for form2 in form_dict2:
if form_dict2[form2][2] != statistic_string:
print(colored("WARNIGN!!!", "red"))
print(form_dict2[form2][2], "!=", statistic_string)
print("The ending", k, " has different statistic_strings. The runs are matched by ending string and should have the same statistic string.")
print("The existing descriptions are")
print(descrp[k])
print("You want to add")
print([form,k,statistic_string,dateti])
sys.exit(0)
descrp[k][form] = [form,k,statistic_string,dateti]
file_in.close()
file_in = open(file, "w")
for k in descrp:
for form in descrp[k]:
[form,k,statistic_string,dateti] = descrp[k][form]
file_in.write("{:s},{:s},{:s},{:s}\n".format(form, k, statistic_string, dateti.strftime("%a %b %d %X %Y")))
file_in.close()
def parse_index(index_string):
return np.array(list(map(lambda x: list(map(lambda y: y == "1", x)), index_string.split("_"))))
def index_to_numeric(index_set):
numeric_index = []
for idx in index_set:
# +1 for the indexes to match R
numeric_index.append(np.where(idx)[0] + 1)
return(np.array(numeric_index))
def index_to_bool(index_set, length):
boolean_index = []
for idx in index_set:
temp = np.array([False]*length)
temp[idx] = True
boolean_index.append(temp)
return(np.array(boolean_index))
def powerset(iterable):
# powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
s = list(iterable)
return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s)+1))
def main(args):
parser = argparse.ArgumentParser(description='Use linear regression to de-bias a sample.')
parser.add_argument("--id", type=str, default="",
help = "Adds id to output file paths for differentiation")
parser.add_argument("--data_path", type=str, default="project_data_as_samples/Data/BNLearn/",
help = "Path without trailing toupper(relation), where data is located.")
parser.add_argument("--global_itr", type=int, default = 0,
help = "An iteration number in case you want multiple iterations of same run")
parser.add_argument("--relation", type=str, default="",
help = "Use given relation/Bayesian network name. Must have <relation>_rand_hist in database.")
parser.add_argument("--sample_type", type=str, default="",
help = "Sample type to use (unif, bias)")
parser.add_argument("--sample_percent", type=float, default = 0,
help = "Sample percentage)")
print("Received arguments", args)
opt = parser.parse_args(args)
if (opt.id == "" or opt.global_itr == 0 or opt.relation == "" or opt.sample_type == "" or opt.sample_percent == 0):
print("You had an option be empty! Make sure all of these are not empty or 0 or -1.")
print("opt.id", opt.id)
print("opt.global_itr", opt.global_itr)
print("opt.relation", opt.relation)
print("opt.sample_type", opt.sample_type)
print("opt.sample_percent", opt.sample_percent)
sys.exit(0)
bn_name = opt.relation
q_orders = ["ASC", "DESC", "RAND", "NULL"]
models = ["RW", "BN", "ME"]
path = opt.data_path
if path[-1] != "/":
path += "/"
path += bn_name.upper() + "/"
plot_path = "project_data_as_samples/Code/Plots/"
description = opt.id
sample_type = opt.sample_type
sample_percent = opt.sample_percent
global_itr = opt.global_itr
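# The run id is assumed to look like "<tag>_d<dim>_...": the dimension is read
# from the second underscore-separated token by dropping its leading character.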
dim = int(description.split("_")[1][1:])
print("Using dimension", dim)
ending = "{:s}_{:s}_smp{:s}_perc{:.2f}_itr{:d}".format(bn_name, description, sample_type, sample_percent, global_itr)
print("Ending", ending)
# This is also checked when we do the join. Technically, we could remove this, but it's nice to remove extraneous lines
# from the file.
sanitize_and_check_description("{:s}{:s}".format(path, "description_map.csv"))
to_merge = []
dtype_dict = {"Id": "str", "Ending": "str", "StatisticString": "str", "NumStats": "int", "NumDim": "int", "Join": "str", "Order": "str", "NumQAttrs": "int", "QAttrs": "str", "LinRWModel": "float", "BNRuntime": "float", "RWRuntime": "float", "BNModel": "float", "BNRWModel": "float", "MERuntime": "float", "MEModel": "float", "MERWModel": "float", "UnifRWModel": "float", "TrueModel": "float", "UnifRW2Model": "float", "True2Model": "float", "UnifRW3Model": "float", "True3Model": "float"}
dtype_dict2 = {"id":"str","ending":"str","statisticstring":"str","num_stats":"int","num_dim":"int","q":"str","q_order":"str","num_q_attrs":"int","q_attrs":"str","q_attrs_idx":"str","bn_runtime":"float","est_bn":"float","est_bn_rw":"float","est_unif_rw":"float","true":"float"}
print("Trying to read")
print("{:s}rw_runtime_{:s}.csv".format(path, ending))
print("{:s}bn_errors_{:s}.csv".format(path, ending))
print("{:s}maxent_errors_{:s}.csv".format(path, ending))
if not os.path.exists("{:s}bn_errors_{:s}.csv".format(path, ending)):
print("The BN file does not exist for ending", ending)
print("Exiting...")
sys.exit()
else:
to_merge.append(pd.read_csv("{:s}bn_errors_{:s}.csv".format(path, ending), header=0, names=["Id","Ending","StatisticString","Join","NumStats","NumDim","NumQAttrs","QAttrs","QAttrsIdx","Order","BNRuntime","BNModel","BNRWModel","UnifRW2Model","True2Model"]))
print("BN COLS", to_merge[-1].columns)
to_merge[-1] = to_merge[-1][to_merge[-1]["Order"] == "DESC"]
if not os.path.exists("{:s}maxent_errors_{:s}.csv".format(path, ending)):
print("The maxent file does not exist for ending", ending)
print("Exiting...")
sys.exit()
else:
to_merge.append(pd.read_csv("{:s}maxent_errors_{:s}.csv".format(path, ending), header=0, dtype=dtype_dict, names=["Id","Ending","StatisticString","Join","NumStats","NumDim","NumQAttrs","QAttrs","Order","MERuntime","True3Model","UnifRW3Model","MERWModel","MEModel"]))
print("Max Ent COLS", to_merge[-1].columns)
to_merge[-1] = to_merge[-1][to_merge[-1]["Order"] == "DESC"]
if not os.path.exists("{:s}rw_runtime_{:s}.csv".format(path, ending)):
print("The reweight file does not exist for ending", ending)
print("Exiting...")
sys.exit()
else:
to_merge.append(pd.read_csv("{:s}rw_runtime_{:s}.csv".format(path, ending), header=0, dtype=dtype_dict, names=["Id", "Ending", "Join", "QAttrs", "NumQAttrs", "RWRuntime", "Order", "TrueModel", "DBModel"]))
print("RW COLS", to_merge[-1].columns)
to_merge[-1] = to_merge[-1][to_merge[-1]["Order"] == "DESC"]
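# The BN and max-ent results share all key columns and are inner-joined first;
# the reweighting results are then outer-joined on the smaller key set, and a
# row-count mismatch afterwards flags a bad merge.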
model_responses = pd.merge(to_merge[0], to_merge[1], how='inner', on=["Id", "Ending", "StatisticString", "NumDim", "NumStats", "Join", "Order", "NumQAttrs", "QAttrs"])
print("MODEL RESPONSES 1 SHAPE", model_responses.shape[0])
for i in range(2,len(to_merge)):
model_responses = pd.merge(model_responses, to_merge[i], how='outer', on=["Id", "Ending", "Join", "Order", "NumQAttrs", "QAttrs"])
bad_shape = False
for for_merge in to_merge:
print("PRE LEN", for_merge.shape[0])
if for_merge.shape[0] != model_responses.shape[0]:
bad_shape = True
print("POST LEN", model_responses.shape[0])
if bad_shape:
model_responses.to_csv("~/Desktop/bad_model_responses.csv", index=False)
print("open ~/Desktop/bad_model_responses.csv")
if bad_shape and model_responses.shape[0] != 10030:
print("REALLY BAD")
sys.exit(0)
model_responses.to_csv("{:s}joined_runtimes_{:s}.csv".format(path, ending), index=False)
if __name__ == '__main__':
main(sys.argv[1:])
\ No newline at end of file
import random
import itertools
random.seed(1)
num_attrs = 20
attrs = list(range(1, num_attrs+1))
dims = [2,3,4,5,6]
num_q_per_dim = 10
final_qs = []
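# For each dimension, enumerate all attribute combinations, shuffle them with
# the fixed seed, and encode each combination as a length-num_attrs binary
# string with a "1" in every chosen attribute position.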
for dim in dims:
print("DIM", dim)
all_possible_stats = list(itertools.combinations(attrs, dim))
random.shuffle(all_possible_stats)
all_possible_stats_str = []
for possible_stat in all_possible_stats:
arr = ["0"]*num_attrs
for i in possible_stat:
arr[i-1] = "1"
all_possible_stats_str.append("".join(arr))