From 05ff1904977400b3831fb42ed722069647062c91 Mon Sep 17 00:00:00 2001 From: cynic Date: Thu, 14 Jul 2022 20:48:07 -0400 Subject: [PATCH] finish --- main.py | 42 ++++++++++++++++++++++++++++++++++-------- readme.txt | 8 ++++++++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index de575ff..780ad8f 100644 --- a/main.py +++ b/main.py @@ -1,20 +1,32 @@ import pandas, seaborn from requests import get +from sys import argv, exit +# check args +cmds = ["and", "max"] +if len(argv) < 2 or argv[1] not in cmds: + print(f"""USAGE:\npython3 main.py <{'|'.join(cmds)}> +max = guess fraudulent if the heuristic for the best column finds it +and = guess fraudulent if ALL the heuristics determined to be good find it""") + exit(-1) + +# download dataset if it isn't already present (too large for github) try: open("creditcard.csv").read() except FileNotFoundError: print("downloading dataset...") txt = get("https://the.silly.computer/creditcard.csv").text open("creditcard.csv", "w").write(txt) +# read dataset into dataframe data = pandas.read_csv("creditcard.csv") data['mean'] = data.mean(axis=1) +# isolate fraud & legitimate sets fraud_set = data.loc[data["Class"] == 1] legit_set = data.loc[data["Class"] == 0] +# find the best columns for determining fraud good_heuristics = [] - for col_name in fraud_set.columns: fm = fraud_set[col_name].mean() lm = legit_set[col_name].mean() @@ -33,20 +45,34 @@ for col_name in fraud_set.columns: print(lm) accuracy = corr/(corr+incorr) print(accuracy) - if (accuracy > .98) and col_name != "Class": + if (accuracy > .95) and col_name != "Class": print("good heuristic!") - good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm}) + good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm, "accuracy": accuracy}) print("") - print(good_heuristics) +# create new dataframe with guesses based on found heuristics and chosen type (max = best column, and = all good columns must match) 
guessed_class = [] +best_heuristic = None +best_acc = 0 +for h in good_heuristics: + if h["accuracy"] > best_acc: + best_acc = h["accuracy"] + best_heuristic = h + +print(f"using heuristic: {best_heuristic['name']}") for r in data.iterrows(): - good = True - for h in good_heuristics: - if abs(r[1][h["name"]] - h["fraud_mean"]) < abs(r[1][h["name"]] - h["legit_mean"]): + if argv[1] == "and": + bools = [] + for h in good_heuristics: + bools.append(abs(r[1][h["name"]] - h["fraud_mean"]) < abs(r[1][h["name"]] - h["legit_mean"])) + fraud = all(bools) + guessed_class.append(1 if fraud else 0) + elif argv[1] == "max": + good = True + if abs(r[1][best_heuristic["name"]] - best_heuristic["fraud_mean"]) < abs(r[1][best_heuristic["name"]] - best_heuristic["legit_mean"]): good = False - guessed_class.append(0 if good else 1) + guessed_class.append(0 if good else 1) data["guess"] = guessed_class print(data.head(10)) data.to_csv("woo.csv") \ No newline at end of file diff --git a/readme.txt b/readme.txt index e69de29..3c553bb 100644 --- a/readme.txt +++ b/readme.txt @@ -0,0 +1,8 @@ +USAGE: +python3 main.py <and|max> +max = guess fraudulent if the heuristic for the best column finds it +and = guess fraudulent if ALL the heuristics determined to be good find it + +the goodness of a heuristic is determined by the proportion of rows it can correctly determine the class of + +i.e. if the accuracy is .90, guess = Class 90% of the time \ No newline at end of file