From 48cd5d882abd2b3e0cdf8cc6a8560d80e8f15719 Mon Sep 17 00:00:00 2001
From: Sean Lopp
Date: Mon, 24 Jan 2022 12:49:22 -0700
Subject: [PATCH 1/2] snippets for xai bqml and looker post

---
Review note: create_models.sql and create_training_data.sql were revised in
review; the blob ids on their `index` lines still name the originally
committed content.

 .../bqml_model.view                           |  0
 .../create_models.sql                         | 23 ++++++++++
 .../create_training_data.sql                  | 42 +++++++++++++++++++
 .../explain_hypothetical_data.sql             | 21 +++++++++
 4 files changed, 86 insertions(+)
 create mode 100644 finance/explainable-fraud-model-bqml-looker/bqml_model.view
 create mode 100644 finance/explainable-fraud-model-bqml-looker/create_models.sql
 create mode 100644 finance/explainable-fraud-model-bqml-looker/create_training_data.sql
 create mode 100644 finance/explainable-fraud-model-bqml-looker/explain_hypothetical_data.sql

diff --git a/finance/explainable-fraud-model-bqml-looker/bqml_model.view b/finance/explainable-fraud-model-bqml-looker/bqml_model.view
new file mode 100644
index 0000000..e69de29
diff --git a/finance/explainable-fraud-model-bqml-looker/create_models.sql b/finance/explainable-fraud-model-bqml-looker/create_models.sql
new file mode 100644
index 0000000..202b462
--- /dev/null
+++ b/finance/explainable-fraud-model-bqml-looker/create_models.sql
@@ -0,0 +1,23 @@
+-- Trains the logistic regression model retail_banking.fraud_prediction.
+-- Label: is_fraud. Features: every other column of
+-- retail_banking.training_data except trans_id, which is excluded below.
+CREATE OR REPLACE MODEL retail_banking.fraud_prediction
+  OPTIONS (
+    model_type = 'logistic_reg',
+    input_label_cols = ['is_fraud']
+  ) AS
+SELECT * EXCEPT (trans_id)
+FROM retail_banking.training_data
+-- Account for class imbalance by down-sampling: keep every fraud row, and
+-- keep each remaining row only when RAND() falls at or below the overall
+-- fraud rate. RAND() is unseeded, so the sampled rows differ between runs.
+-- Alternatively, use AUTO_CLASS_WEIGHTS=True in the model options.
+WHERE
+  is_fraud IS TRUE
+  OR (
+    is_fraud IS NOT TRUE
+    AND RAND() <= (
+      SELECT COUNTIF(is_fraud) / COUNT(*)
+      FROM retail_banking.training_data
+    )
+  );
\ No newline at end of file
diff --git a/finance/explainable-fraud-model-bqml-looker/create_training_data.sql b/finance/explainable-fraud-model-bqml-looker/create_training_data.sql
new file mode 100644
index 0000000..15f27df
--- /dev/null
+++ b/finance/explainable-fraud-model-bqml-looker/create_training_data.sql
@@ -0,0 +1,42 @@
+-- Builds retail_banking.training_data from card transactions joined to
+-- card, disp and client: the label (is_fraud) plus three numeric features.
+CREATE OR REPLACE TABLE retail_banking.training_data AS (
+  SELECT
+    card_transactions.trans_id AS trans_id,
+    card_transactions.is_fraud AS is_fraud,
+
+    -- Amount for transaction: higher amounts are more likely to be fraud.
+    CAST(card_transactions.amount AS FLOAT64) AS card_transactions_amount,
+
+    -- Distance from the customer's home: further distances are more likely
+    -- to be fraud. client.address is split on '|'; element 4 is passed as
+    -- longitude and element 3 as latitude, mirroring the merchant_lon /
+    -- merchant_lat argument order. SAFE_OFFSET yields NULL (instead of
+    -- failing the whole statement) when an address has too few elements.
+    ST_DISTANCE(
+      ST_GEOGPOINT(
+        CAST(card_transactions.merchant_lon AS FLOAT64),
+        CAST(card_transactions.merchant_lat AS FLOAT64)
+      ),
+      ST_GEOGPOINT(
+        CAST(SPLIT(client.address, '|')[SAFE_OFFSET(4)] AS FLOAT64),
+        CAST(SPLIT(client.address, '|')[SAFE_OFFSET(3)] AS FLOAT64)
+      )
+    ) AS card_transactions_transaction_distance,
+
+    -- Hour that transaction occurred: fraud occurs in middle of night
+    -- (usually between midnight and 4 am).
+    EXTRACT(
+      HOUR FROM TIMESTAMP(
+        CONCAT(card_transactions.trans_date, ' ', card_transactions.trans_time)
+      )
+    ) AS card_transactions_transaction_hour_of_day
+
+  FROM `looker-private-demo.retail_banking.card_transactions` AS card_transactions
+  LEFT JOIN `looker-private-demo.retail_banking.card` AS card
+    ON card.card_number = card_transactions.cc_number
+  LEFT JOIN `looker-private-demo.retail_banking.disp` AS disp
+    ON card.disp_id = disp.disp_id
+  LEFT JOIN `looker-private-demo.retail_banking.client` AS client
+    ON disp.client_id = client.client_id
+);
\ No newline at end of file
diff --git 
a/finance/explainable-fraud-model-bqml-looker/explain_hypothetical_data.sql b/finance/explainable-fraud-model-bqml-looker/explain_hypothetical_data.sql
new file mode 100644
index 0000000..b66123f
--- /dev/null
+++ b/finance/explainable-fraud-model-bqml-looker/explain_hypothetical_data.sql
@@ -0,0 +1,21 @@
+-- Explains the fraud prediction for three hypothetical transactions.
+SELECT *
+FROM ML.EXPLAIN_PREDICT(
+  MODEL retail_banking.fraud_prediction,
+  (
+    SELECT '001' AS trans_id,
+           500.00 AS card_transactions_amount,
+           600 AS card_transactions_transaction_distance,
+           2 AS card_transactions_transaction_hour_of_day
+    UNION ALL
+    SELECT '002' AS trans_id,
+           5.25 AS card_transactions_amount,
+           2 AS card_transactions_transaction_distance,
+           13 AS card_transactions_transaction_hour_of_day
+    UNION ALL
+    SELECT '003' AS trans_id,
+           175.50 AS card_transactions_amount,
+           45 AS card_transactions_transaction_distance,
+           10 AS card_transactions_transaction_hour_of_day
+  ), STRUCT(0.55 AS threshold)  -- settings struct: threshold = 0.55
+)
\ No newline at end of file
From 5e779bbc947c8ab670ca63a2c92ee9b4326dfd91 Mon Sep 17 00:00:00 2001 From: Sean Lopp Date: Mon, 24 Jan 2022 12:51:37 -0700 Subject: [PATCH 2/2] add readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 65b9623..868349e 100644 --- a/README.md +++ b/README.md @@ -26,6 +27,7 @@ From sample dataset to activation, these componentized patterns are designed to ### Financial use cases * Fraud detection * How to build a real-time credit card fraud detection solution. ([Code][ccfraud_code] | [Blogpost][ccfraud_techblog] | [Video][ccfraud_video]) + * How to use explainable AI to understand a fraud prediction. 
([Code][xai_code] | [Blogpost][xai_blog]) [gaming_propen_code]: gaming/propensity-model/bqml @@ -48,7 +49,8 @@ From sample dataset to activation, these componentized patterns are designed to [ccfraud_code]: https://gitlab.qdatalabs.com/uk-gtm/patterns/cc_fraud_detection/tree/master [ccfraud_techblog]: https://cloud.google.com/blog/products/data-analytics/how-to-build-a-fraud-detection-solution [ccfraud_video]: https://youtu.be/qQnxq3COr9Q - +[xai_code]: finance/explainable-fraud-model-bqml-looker +[xai_blog]: https://cloud.google.com/blog/products/data-analytics/explainable-ai-using-bigquery-machine-learning-and-looker