From 02b78b50292de853885b46a2f9db4424b0bfe0ff Mon Sep 17 00:00:00 2001 From: cynic Date: Tue, 19 Jul 2022 01:04:45 -0400 Subject: [PATCH] switch to machine learning --- main.py | 78 +++++++++++++++++------------------------------------ main_old.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++ readme.txt | 8 +----- 3 files changed, 103 insertions(+), 61 deletions(-) create mode 100644 main_old.py diff --git a/main.py b/main.py index 780ad8f..5c6e1ca 100644 --- a/main.py +++ b/main.py @@ -1,13 +1,14 @@ import pandas, seaborn from requests import get from sys import argv, exit +import sklearn +import sklearn.linear_model +import sklearn.metrics +import sklearn.model_selection # chack args -cmds = ["and", "max"] -if len(argv) < 2 or argv[1] not in cmds: - print(f"""USAGE:\npython3 main.py <{'|'.join(cmds)}> -max = guess fraudulent if the heuristic for the best column finds it -and = guess fraudulent if ALL the heuristics determined to be good find it""") +if "help" in argv: + print(f"""USAGE:\npython3 main.py""") exit(-1) # download dataset if it isn't already present (too large for github) @@ -25,54 +26,23 @@ data['mean'] = data.mean(axis=1) fraud_set = data.loc[data["Class"] == 1] legit_set = data.loc[data["Class"] == 0] -#find the best columns for determining fraud -good_heuristics = [] -for col_name in fraud_set.columns: - fm = fraud_set[col_name].mean() - lm = legit_set[col_name].mean() - corr = 0 - incorr = 0 - for r in data.iterrows(): - if abs(r[1][col_name] - fm) < abs(r[1][col_name] - lm): - if r[1]["Class"] == 1: corr += 1 - else: incorr += 1 - elif abs(r[1][col_name] - fm) > abs(r[1][col_name] - lm): - if r[1]["Class"] == 0: corr += 1 - else: incorr += 1 - - print(col_name) - print(fm) - print(lm) - accuracy = corr/(corr+incorr) - print(accuracy) - if (accuracy > .95) and col_name != "Class": - print("good heuristic!") - good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm, "accuracy": accuracy}) - print("") -print(good_heuristics) +#get variables ready for training +x = data.loc[:, "V1":"V28"] +y = data["Class"] -# create new dataframe with guesses based on found heuristics and chosen type (max = best column, and = all good columns must match) -guessed_class = [] -best_heuristic = None -best_acc = 0 -for h in good_heuristics: - if h["accuracy"] > best_acc: - best_acc = h["accuracy"] - best_heuristic = h +xtrain, xtest, ytrain, ytest = sklearn.model_selection.train_test_split(x, y, random_state = 0) -print(f"using heuristic: {best_heuristic['name']}") -for r in data.iterrows(): - if argv[1] == "and": - bools = [] - for h in good_heuristics: - bools.append(abs(r[1][h["name"]] - h["fraud_mean"]) < abs(r[1][h["name"]] - h["legit_mean"])) - fraud = all(bools) - guessed_class.append(1 if fraud else 0) - elif argv[1] == "max": - good = True - if abs(r[1][best_heuristic["name"]] - best_heuristic["fraud_mean"]) < abs(r[1][best_heuristic["name"]] - best_heuristic["legit_mean"]): - good = False - guessed_class.append(0 if good else 1) -data["guess"] = guessed_class -print(data.head(10)) -data.to_csv("woo.csv") \ No newline at end of file +#train +classifier = sklearn.linear_model.SGDClassifier() +classifier.fit(xtrain, ytrain) + +#predict +predictions = classifier.predict(xtest) + +#save +accuracy = sklearn.metrics.accuracy_score(ytest, predictions) +save = input(f"accuracy: {accuracy}, save data to .csv? [y/n]") +data["guess"] = classifier.predict(x).flatten() +print(data) +if save.lower() == "y": + data.to_csv("guesses.csv") \ No newline at end of file diff --git a/main_old.py b/main_old.py new file mode 100644 index 0000000..780ad8f --- /dev/null +++ b/main_old.py @@ -0,0 +1,78 @@ +import pandas, seaborn +from requests import get +from sys import argv, exit + +# chack args +cmds = ["and", "max"] +if len(argv) < 2 or argv[1] not in cmds: + print(f"""USAGE:\npython3 main.py <{'|'.join(cmds)}> +max = guess fraudulent if the heuristic for the best column finds it +and = guess fraudulent if ALL the heuristics determined to be good find it""") + exit(-1) + +# download dataset if it isn't already present (too large for github) +try: open("creditcard.csv").read() +except FileNotFoundError: + print("downloading dataset...") + txt = get("https://the.silly.computer/creditcard.csv").text + open("creditcard.csv", "w").write(txt) + +# read dataset into dataframe +data = pandas.read_csv("creditcard.csv") +data['mean'] = data.mean(axis=1) + +# isolate fraud & legitimate sets +fraud_set = data.loc[data["Class"] == 1] +legit_set = data.loc[data["Class"] == 0] + +#find the best columns for determining fraud +good_heuristics = [] +for col_name in fraud_set.columns: + fm = fraud_set[col_name].mean() + lm = legit_set[col_name].mean() + corr = 0 + incorr = 0 + for r in data.iterrows(): + if abs(r[1][col_name] - fm) < abs(r[1][col_name] - lm): + if r[1]["Class"] == 1: corr += 1 + else: incorr += 1 + elif abs(r[1][col_name] - fm) > abs(r[1][col_name] - lm): + if r[1]["Class"] == 0: corr += 1 + else: incorr += 1 + + print(col_name) + print(fm) + print(lm) + accuracy = corr/(corr+incorr) + print(accuracy) + if (accuracy > .95) and col_name != "Class": + print("good heuristic!") + good_heuristics.append({"name": col_name, "fraud_mean": fm, "legit_mean": lm, "accuracy": accuracy}) + print("") +print(good_heuristics) + +# create new dataframe with guesses based on found heuristics and chosen type (max = best column, and = all good columns must match) +guessed_class = [] +best_heuristic = None +best_acc = 0 +for h in good_heuristics: + if h["accuracy"] > best_acc: + best_acc = h["accuracy"] + best_heuristic = h + +print(f"using heuristic: {best_heuristic['name']}") +for r in data.iterrows(): + if argv[1] == "and": + bools = [] + for h in good_heuristics: + bools.append(abs(r[1][h["name"]] - h["fraud_mean"]) < abs(r[1][h["name"]] - h["legit_mean"])) + fraud = all(bools) + guessed_class.append(1 if fraud else 0) + elif argv[1] == "max": + good = True + if abs(r[1][best_heuristic["name"]] - best_heuristic["fraud_mean"]) < abs(r[1][best_heuristic["name"]] - best_heuristic["legit_mean"]): + good = False + guessed_class.append(0 if good else 1) +data["guess"] = guessed_class +print(data.head(10)) +data.to_csv("woo.csv") \ No newline at end of file diff --git a/readme.txt b/readme.txt index 3c553bb..6469d5a 100644 --- a/readme.txt +++ b/readme.txt @@ -1,8 +1,2 @@ USAGE: -python3 main.py <{'|'.join(cmds)}> -max = guess fraudulent if the heuristic for the best column finds it -and = guess fraudulent if ALL the heuristics determined to be good find it - -the goodness of a heuristic is determined by the proportion of rows it can correctly determine the class of - -i.e. if the accuracy is .90, guess = Class 90% of the time \ No newline at end of file +python3 main.py \ No newline at end of file