From 05ff1904977400b3831fb42ed722069647062c91 Mon Sep 17 00:00:00 2001
From: cynic <admin@cynic.moe>
Date: Thu, 14 Jul 2022 20:48:07 -0400
Subject: [PATCH] finish

---
 main.py    | 42 ++++++++++++++++++++++++++++++++++--------
 readme.txt |  8 ++++++++
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/main.py b/main.py
index de575ff..780ad8f 100644
--- a/main.py
+++ b/main.py
@@ -1,20 +1,32 @@
 import pandas, seaborn
 from requests import get
+from sys import argv, exit
 
+# check args
+cmds = ["and", "max"]
+if len(argv) < 2 or argv[1] not in cmds:
+    print(f"""USAGE:\npython3 main.py <{'|'.join(cmds)}>
+max = guess fraudulent if the heuristic for the best column finds it
+and = guess fraudulent if ALL the heuristics determined to be good find it""")
+    exit(-1)
+
+# download dataset if it isn't already present (too large for github)
 try: open("creditcard.csv").read()
 except FileNotFoundError:
     print("downloading dataset...")
     txt = get("https://the.silly.computer/creditcard.csv").text
     open("creditcard.csv", "w").write(txt)
 
+# read dataset into dataframe
 data = pandas.read_csv("creditcard.csv")
 data['mean'] = data.mean(axis=1)
 
+# isolate fraud & legitimate sets
 fraud_set = data.loc[data["Class"] == 1]
 legit_set = data.loc[data["Class"] == 0]
 
+# find the best columns for determining fraud
 good_heuristics = []
-
 for col_name in fraud_set.columns:
     fm = fraud_set[col_name].mean()
     lm = legit_set[col_name].mean()
@@ -33,20 +45,34 @@ for col_name in fraud_set.columns:
     print(lm)
     accuracy = corr/(corr+incorr)
     print(accuracy)
-    if (accuracy > .98) and col_name != "Class":
+    if (accuracy > .95) and col_name != "Class":
         print("good heuristic!")
-        good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm})
+        good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm, "accuracy": accuracy})
     print("")
-
 print(good_heuristics)
 
+# create new dataframe with guesses based on found heuristics and chosen type (max = best column, and = all good columns must match)
 guessed_class = []
+best_heuristic = None
+best_acc = 0
+for h in good_heuristics:
+    if h["accuracy"] > best_acc:
+        best_acc = h["accuracy"]
+        best_heuristic = h
+
+print(f"using heuristic: {best_heuristic['name']}")
 for r in data.iterrows():
-    good = True
-    for h in good_heuristics:
-        if abs(r[1][h["name"]] - h["fraud_mean"]) < abs(r[1][h["name"]] - h["legit_mean"]):
+    if argv[1] == "and":
+        bools = []
+        for h in good_heuristics:
+            bools.append(abs(r[1][h["name"]] - h["fraud_mean"]) < abs(r[1][h["name"]] - h["legit_mean"]))
+        fraud = all(bools)
+        guessed_class.append(1 if fraud else 0)
+    elif argv[1] == "max":
+        good = True
+        if abs(r[1][best_heuristic["name"]] - best_heuristic["fraud_mean"]) < abs(r[1][best_heuristic["name"]] - best_heuristic["legit_mean"]):
             good = False
-    guessed_class.append(0 if good else 1)
+        guessed_class.append(0 if good else 1)
 data["guess"] = guessed_class
 print(data.head(10))
 data.to_csv("woo.csv")
\ No newline at end of file
diff --git a/readme.txt b/readme.txt
index e69de29..3c553bb 100644
--- a/readme.txt
+++ b/readme.txt
@@ -0,0 +1,8 @@
+USAGE:
+python3 main.py <and|max>
+max = guess fraudulent if the heuristic for the best column finds it
+and = guess fraudulent if ALL the heuristics determined to be good find it
+
+the goodness of a heuristic is determined by the proportion of rows it can correctly determine the class of
+
+i.e. if the accuracy is .90, guess = Class 90% of the time
\ No newline at end of file