|
|
@ -1,20 +1,32 @@ |
|
|
|
import pandas, seaborn |
|
|
|
import pandas, seaborn |
|
|
|
from requests import get |
|
|
|
from requests import get |
|
|
|
|
|
|
|
from sys import argv, exit |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# check args
# The first command-line argument selects how per-column heuristics are
# combined into a guess; anything else is a usage error.
cmds = ["and", "max"]
if len(argv) < 2 or argv[1] not in cmds:
    # exit() with a string writes it to stderr and terminates with status 1,
    # keeping the usage text out of stdout (exit(-1) wraps to status 255).
    exit(
        "USAGE:\n"
        f"python3 main.py <{'|'.join(cmds)}>\n"
        "max = guess fraudulent if the heuristic for the best column finds it\n"
        "and = guess fraudulent if ALL the heuristics determined to be good find it"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# download dataset if it isn't already present (too large for github)
try:
    # Opening the file is enough to learn whether it exists; there is no need
    # to read its contents, and the context manager closes the handle.
    with open("creditcard.csv"):
        pass
except FileNotFoundError:
    print("downloading dataset...")
    response = get("https://the.silly.computer/creditcard.csv")
    # Fail on an HTTP error status instead of saving an error page as the CSV.
    response.raise_for_status()
    # Store the payload byte-for-byte: no text decoding, re-encoding or
    # newline translation on the way to disk.
    with open("creditcard.csv", "wb") as dataset_file:
        dataset_file.write(response.content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# read dataset into dataframe
data = pandas.read_csv("creditcard.csv")
# Add a per-row mean as an extra candidate column. The "Class" label is left
# out of the average so the derived column cannot leak the value that the
# heuristics are later asked to guess.
data['mean'] = data.drop(columns="Class").mean(axis=1)

# isolate fraud & legitimate sets
# Rows are split on the "Class" column: 1 goes to fraud_set, 0 to legit_set.
fraud_set = data.loc[data["Class"] == 1]
legit_set = data.loc[data["Class"] == 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#find the best columns for determining fraud |
|
|
|
good_heuristics = [] |
|
|
|
good_heuristics = [] |
|
|
|
|
|
|
|
|
|
|
|
for col_name in fraud_set.columns: |
|
|
|
for col_name in fraud_set.columns: |
|
|
|
fm = fraud_set[col_name].mean() |
|
|
|
fm = fraud_set[col_name].mean() |
|
|
|
lm = legit_set[col_name].mean() |
|
|
|
lm = legit_set[col_name].mean() |
|
|
@ -33,20 +45,34 @@ for col_name in fraud_set.columns: |
|
|
|
print(lm) |
|
|
|
print(lm) |
|
|
|
accuracy = corr/(corr+incorr) |
|
|
|
accuracy = corr/(corr+incorr) |
|
|
|
print(accuracy) |
|
|
|
print(accuracy) |
|
|
|
if (accuracy > .98) and col_name != "Class": |
|
|
|
if (accuracy > .95) and col_name != "Class": |
|
|
|
print("good heuristic!") |
|
|
|
print("good heuristic!") |
|
|
|
good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm}) |
|
|
|
good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm, "accuracy": accuracy}) |
|
|
|
print("") |
|
|
|
print("") |
|
|
|
|
|
|
|
|
|
|
|
print(good_heuristics) |
|
|
|
print(good_heuristics) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# create new dataframe with guesses based on found heuristics and chosen type
# (max = best column, and = all good columns must match)

# Without at least one qualifying column there is nothing to base a guess on:
# best_heuristic would remain None, and an empty "and" would flag every row.
if not good_heuristics:
    raise SystemExit("no column passed the accuracy threshold; cannot guess")

# Pick the most accurate column; max() keeps the earliest entry on ties,
# matching a strict '>' scan over the list.
best_heuristic = max(good_heuristics, key=lambda h: h["accuracy"])
best_acc = best_heuristic["accuracy"]
print(f"using heuristic: {best_heuristic['name']}")


def _nearer_fraud_mean(heuristic):
    """Return a boolean Series marking rows that look fraudulent.

    A row is marked True when its value in the heuristic's column is strictly
    closer to the recorded fraud mean than to the recorded legit mean.
    """
    values = data[heuristic["name"]]
    distance_to_fraud = (values - heuristic["fraud_mean"]).abs()
    distance_to_legit = (values - heuristic["legit_mean"]).abs()
    return distance_to_fraud < distance_to_legit


if argv[1] == "and":
    # Every good column must agree before a row is guessed fraudulent.
    flagged = pandas.Series(True, index=data.index)
    for h in good_heuristics:
        flagged = flagged & _nearer_fraud_mean(h)
elif argv[1] == "max":
    # Only the single best column decides.
    flagged = _nearer_fraud_mean(best_heuristic)
else:
    raise SystemExit(f"unknown mode: {argv[1]}")

# Exactly one guess per row, in row order: 1 = fraud, 0 = legitimate.
guessed_class = flagged.astype(int).tolist()
data["guess"] = guessed_class
print(data.head(10))
data.to_csv("woo.csv")