From 315bafd6dc31bfe5e09fddb4a2fb919679757b2a Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 25 Feb 2026 13:14:56 +0930 Subject: [PATCH] week2 --- config.yaml | 4 ++ logs/pipeline.log | 64 ++++++++++++++++++ main.py | 19 ++++++ output/rf_scored_dataset.csv | 10 +++ output/unified_dataset.csv | 10 +++ output/unified_dataset_cleaned.csv | 10 +++ output/unified_dataset_deliverable.csv | 10 +++ output/unified_dataset_ethics.csv | 10 +++ output/unified_dataset_with_triggers.csv | 10 +++ requirement.txt | 0 sample_data/clickstream.csv | 10 +++ sample_data/crm.csv | 8 +++ src/__init__.py | 0 src/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 161 bytes src/automation/__init__.py | 0 src/pipeline/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 170 bytes .../__pycache__/config_loader.cpython-312.pyc | Bin 0 -> 812 bytes .../deliverability_check.cpython-312.pyc | Bin 0 -> 1003 bytes .../__pycache__/ethics_flags.cpython-312.pyc | Bin 0 -> 1287 bytes .../__pycache__/list_hygiene.cpython-312.pyc | Bin 0 -> 1006 bytes .../relative_date_trigger.cpython-312.pyc | Bin 0 -> 1075 bytes .../__pycache__/rf_scoring.cpython-312.pyc | Bin 0 -> 1305 bytes .../unified_stream.cpython-312.pyc | Bin 0 -> 2344 bytes src/pipeline/config_loader.py | 9 +++ src/pipeline/deliverability_check.py | 15 ++++ src/pipeline/ethics_flags.py | 27 ++++++++ src/pipeline/list_hygiene.py | 15 ++++ src/pipeline/relative_date_trigger.py | 15 ++++ src/pipeline/rf_scoring.py | 18 +++++ src/pipeline/unified_stream.py | 32 +++++++++ wiki/gdpr-audit.md | 54 +++++++++++++++ 32 files changed, 350 insertions(+) create mode 100644 config.yaml create mode 100644 logs/pipeline.log create mode 100644 main.py create mode 100644 output/rf_scored_dataset.csv create mode 100644 output/unified_dataset.csv create mode 100644 output/unified_dataset_cleaned.csv create mode 100644 output/unified_dataset_deliverable.csv create mode 100644 output/unified_dataset_ethics.csv create mode 100644 output/unified_dataset_with_triggers.csv create mode 100644 requirement.txt create mode 100644 sample_data/clickstream.csv create mode 100644 sample_data/crm.csv create mode 100644 src/__init__.py create mode 100644 src/__pycache__/__init__.cpython-312.pyc create mode 100644 src/automation/__init__.py create mode 100644 src/pipeline/__init__.py create mode 100644 src/pipeline/__pycache__/__init__.cpython-312.pyc create mode 100644 src/pipeline/__pycache__/config_loader.cpython-312.pyc create mode 100644 src/pipeline/__pycache__/deliverability_check.cpython-312.pyc create mode 100644 src/pipeline/__pycache__/ethics_flags.cpython-312.pyc create mode 100644 src/pipeline/__pycache__/list_hygiene.cpython-312.pyc create mode 100644 src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc create mode 100644 src/pipeline/__pycache__/rf_scoring.cpython-312.pyc create mode 100644 src/pipeline/__pycache__/unified_stream.cpython-312.pyc create mode 100644 src/pipeline/config_loader.py create mode 100644 src/pipeline/deliverability_check.py create mode 100644 src/pipeline/ethics_flags.py create mode 100644 src/pipeline/list_hygiene.py create mode 100644 src/pipeline/relative_date_trigger.py create mode 100644 src/pipeline/rf_scoring.py create mode 100644 src/pipeline/unified_stream.py create mode 100644 wiki/gdpr-audit.md diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..680c808 --- /dev/null +++ b/config.yaml @@ -0,0 +1,4 @@ +crm_file: "sample_data/crm.csv" +clickstream_file: "sample_data/clickstream.csv" +output_dir: "output" +log_dir: "logs" diff --git a/logs/pipeline.log b/logs/pipeline.log new file mode 100644 index 0000000..1cb4cb9 --- /dev/null +++ b/logs/pipeline.log @@ -0,0 +1,64 @@ +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv diff --git a/main.py b/main.py new file mode 100644 index 0000000..c4f3696 --- /dev/null +++ b/main.py @@ -0,0 +1,19 @@ +# main.py +from src.pipeline.unified_stream import load_and_merge +from src.pipeline.relative_date_trigger import apply_birthday_recursion +from src.pipeline.rf_scoring import calculate_rf_score +from src.pipeline.list_hygiene import apply_list_hygiene +from src.pipeline.ethics_flags import apply_ethics_checks +from src.pipeline.deliverability_check import run_deliverability_check + +print("=== WEEK 2 PIPELINE START ===") + +load_and_merge() +apply_birthday_recursion() +calculate_rf_score() +apply_list_hygiene() + +apply_ethics_checks() +run_deliverability_check() + +print("=== WEEK 2 PIPELINE COMPLETE ===") diff --git a/output/rf_scored_dataset.csv b/output/rf_scored_dataset.csv new file mode 100644 index 0000000..154cba3 --- /dev/null +++ b/output/rf_scored_dataset.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.034482758620689655 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.02040816326530612 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.02564102564102564 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.045454545454545456 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.006211180124223602 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.015384615384615385 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.07142857142857142 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388345 diff --git a/output/unified_dataset.csv b/output/unified_dataset.csv new file mode 100644 index 0000000..3241e63 --- /dev/null +++ b/output/unified_dataset.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501 diff --git a/output/unified_dataset_cleaned.csv b/output/unified_dataset_cleaned.csv new file mode 100644 index 0000000..42d281b --- /dev/null +++ b/output/unified_dataset_cleaned.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388 diff --git a/output/unified_dataset_deliverable.csv b/output/unified_dataset_deliverable.csv new file mode 100644 index 0000000..90771ec --- /dev/null +++ b/output/unified_dataset_deliverable.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,deliverable +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,True +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,True +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,True +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,True +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,True +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,True +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,True +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,True +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,True diff --git a/output/unified_dataset_ethics.csv b/output/unified_dataset_ethics.csv new file mode 100644 index 0000000..abbfeac --- /dev/null +++ b/output/unified_dataset_ethics.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,sensitive_flag +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,False +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,False +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,False +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,False +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,False +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,False +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,False +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,False +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,False diff --git a/output/unified_dataset_with_triggers.csv b/output/unified_dataset_with_triggers.csv new file mode 100644 index 0000000..265c897 --- /dev/null +++ b/output/unified_dataset_with_triggers.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10 diff --git a/requirement.txt b/requirement.txt new file mode 100644 index 0000000..e69de29 diff --git a/sample_data/clickstream.csv b/sample_data/clickstream.csv new file mode 100644 index 0000000..473f134 --- /dev/null +++ b/sample_data/clickstream.csv @@ -0,0 +1,10 @@ +user_id,timestamp,page_depth,dwell_time,session_id +1,2026-01-15 09:15:00,5,120,101 +1,2026-02-01 10:20:00,3,90,102 +2,2025-12-20 14:10:00,4,60,201 +2,2026-02-10 08:50:00,6,180,202 +3,2026-01-05 12:30:00,2,30,301 +4,2026-01-25 16:45:00,3,60,401 +5,2025-08-01 11:00:00,5,150,501 +6,2026-01-30 09:00:00,4,120,601 +7,2025-09-15 17:10:00,2,45,701 diff --git a/sample_data/crm.csv b/sample_data/crm.csv new file mode 100644 index 0000000..6c6031c --- /dev/null +++ b/sample_data/crm.csv @@ -0,0 +1,8 @@ +user_id,email,birth_date,gender,consent_flag +1,alice@gmail.com,1990-02-17,F,True +2,bob@yahoo.com,1985-07-05,M,True +3,charlie@tempmail.com,2000-11-21,M,False +4,dana@outlook.com,1992-05-30,F,True +5,eric@mailinator.com,1988-09-10,M,True +6,frank@gmail.com,1995-12-01,M,True +7,grace@no-reply.com,1993-03-15,F,True diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/__pycache__/__init__.cpython-312.pyc b/src/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e388c68dd791f1006959ca1099a7da34b24b3c4 GIT binary patch literal 161 zcmX@j%ge<81TW&JWP<3&AOanHW&w&!XQ*V*Wb|9fP{ah}eFmxd<&bO@6Iz^FR2<`& zoLUs)lAm0fo0?Zr9OIc+l3J7(layIhl97^FnVVW%oS2@OmmX7GlpGTupP83g5+AQu iQ2C3)CO1E&G$+-rh!tocBM=vZ7$2D#85xV1fh+(X1Szrr literal 0 HcmV?d00001 diff --git a/src/automation/__init__.py b/src/automation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pipeline/__pycache__/__init__.cpython-312.pyc b/src/pipeline/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f12bf597844da333dde3c3c55524f28a252b9bf GIT binary patch literal 170 zcmX@j%ge<81j%tzGC}lX5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!@=Ugh2`x@7DvohX zPA!UY$xklLP0cGQj`7SZNiE8YNy;oL$w*18%uOvWPE60tOOGioN{%VWEJ)4C%u9`l rkI&4@EQycTE2#X%VUwGmQks)$SHuc5nGuMKL5z>gjEsy$%s>_ZovAG6 literal 0 HcmV?d00001 diff --git a/src/pipeline/__pycache__/config_loader.cpython-312.pyc b/src/pipeline/__pycache__/config_loader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62173ca354e3c49c723b765e5cd04a7ed8da96d7 GIT binary patch literal 812 zcmcgpJ!sod5PnaVZOOLWLPHlrGjs}pkh&D5X{oSD3t2L>NlSzpp-4}PjQ*nMkf@W# zhz{yds57NwO@^$^6v$NQs3p8irBgST3;{!-k`<9GS#t;X-F@%wzJvFZ<0>FAUHNSO zOYnDE)UZ_cF?M8$*T{?u+lv2??Q zce1(b5tTIRfXWz=p#WkLqZM-I2@op@2Rrw#&ty5OcF%u}ZMW8>$5z57c9+$MIbT>#@Gkr&~5h*L{c+4=e zteX~+Wx}*uZ(hm^i?0dYaZQI@Zj;AtwCf>+~TQ+jKSXy+t$akc5)9-F*N?;?56~lB43Hz$#nI5uC2T9qo@n+=Dc)^Hy zZoAr+gO#!`mV>E*6&6}Xhc?M_2<66kL!dzE_de|Tw#U>WPFUV1C!){`^aZfVt{AME$ddoz#k&Cbl%WU?2?(5?5R zM+4wMfiNTvqI+GI7C-<&2NYmXwStB;6mcRIy`V#Y^p~Izt;1aOP)X(wq0Bib4BK-{ zWXV=!7`hxpOga6}G~wsWf706ZSfm4l6i=%hBl zjxr`=12!OvD0TT#KwL=W{tGYwNDsAA8J(5fe{gP`IoBGnJn(ndhK71R$HG+*G~iQ^ zi_eQ*ug0r2UPOehXKF4f5sZq+;ugcaXgk<)F`Bg5s)*u>MVw=JS0oU2$SS7RqJ!7R z{?weCRJsT9@S?;;cr6zfn#Y7*wfM5o%N}v5IzgdTk%&=jp`vONVVsJXJZ)|fmkEP= ziq1tuFL9O65^=E5P)Q_CjVpqwYmm)mj~DFQ<^sc%nfGi=&HJ8Rt6-NiGw*UtU2~C8 zzKpDT1v6$X5qHUC)HbW6DtF{!^UOPO2(K6IWo)lZR_h{7Yi{vhlXO@%R=0?)$>f9m zfmCa_Iouj;j&7B<&GywByVH+515f;i`|-rv%$v-+LMNW}=Z*|8lKF6JGx4#1v;TW~ zs-2$tlIo;q{5gN4tq=VgyxbaZj<+V76I-j>PJ8U;?##2!;B)`co^HI#zsSE{*&g!q z-}LdlMAFZ9Wp9u(gi*CzWHH(&Ax+36k|)om=y(>wH2B|v*{gX7l?iXILzxu literal 0 HcmV?d00001 diff --git a/src/pipeline/__pycache__/ethics_flags.cpython-312.pyc b/src/pipeline/__pycache__/ethics_flags.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f93269d1b967e062ca21839bb32eb52857afbf4 GIT binary patch literal 1287 zcmb_cUx*t;7@yhQY?GX+rfDoS(Y-6wBT$Q2Jw$?_9@;29=p7=2NZ9U7a+7XmmzmAk zlnq1>tWc;CAwJy0;hS6^d=h=q`sz#4h%M6!VxRg}d%oBwzfH2`IA8r>zxn3(ecyb4 z=9~Q`lSw0>KQDaAzS9x97j0tF=EC0dAPf;f1iPq;r>j=gh(>f*ZyQwuBVv4ls);Tx zCGJN~Yxgnm5>}?;@dj%;5g1bf1PF!pw!vtKf*41wIN)Ohp;lylfC4iH2tE!X(RSya zJrty3U29rcyqm2d+Kmze>ft6L`q#Lph1x2*raM{_IogLg(CDEa;?@&0yZw9%NqqH4 zs1LNOc?0zfur$PSh(jaFqV?o{+nKyOhg#YH>WFbY5oZ%(VwS;#C&-~#{vZ01eB74= zR0Ophq!8j=PsLUkg?IzHfZ^-uAE8ouO{Kj~;CF&LVd8S3!&!q-QYUs`OB&Q2m)e|? z(~jIK=_=c%ggLfbhdU=MU>;Xl-*Z8VK+k79f^vk>z!u%QpfV85iR?r)4lpf-eThkR zSW+&*Ym3$!uHAh5@-qkH(qNN0avBz^1mlEWDy5a?NoDwUu&Ioe$G9-yP^I}qB?Yxf z9g9D;&AZAByhvFxRhsdr)&_H_(nw=}Cg3QQ4qRm@BpEQ?ggOCR zm^UbLlF?u;PU+D(=e64Vk_uUS!=a+K>N%Y@<$EBzCt=rL>!HTuln6 z<};tVjMLhz#AyTMX}_y-w(q;$`W$vPsdG_^XCYh^QvM9AkM3r3!()SE!{VU0-Poy( zpL=Qa^7%=2z5njr)WWsGXNAwJlhk7W&4(s>`UBiwdtjoIXMQ_+_N$F=o$uFwto~5F zx$@2z8>6FV`{#iDWuj#MwzahL!q|H8mbEgrRwmY~x2*EmD*v=JvdR-{^@=uHDBMgI z;i$!v)6FBpMm6`og>aJ%KDyFwQZNxWd u57bB$fjvr=+aBq-^mTCpw$YoH(fB{mF~;{$?hZ=*j+XX}JU%Rz0RI<&oIo-F literal 0 HcmV?d00001 diff --git a/src/pipeline/__pycache__/list_hygiene.cpython-312.pyc b/src/pipeline/__pycache__/list_hygiene.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9119a494f910e6e68f2cea3d0e0bede67afc83b0 GIT binary patch literal 1006 zcmb_b%}*0S6rb5{mu_Jtd{kmev>t#oAsqa;5Mm-RVnsPXywpwBb!S?p?pJ2EgpxKX ziHUH6CMLab;A-%X@b0BeO(ZLx@WM$FZl0W3SQWkcHv49Nzj<%oZ)e_q)U-6<_Wi|M zB1-@q3lJN}k)2KQp$!l~&;~^qjZ#rU5=z(!M=r_`Ao(>YCTlRCJP|2NCy@I*3>C|B zOQdWGGPWe2B=~za%Y)h=h3YSyczS0n-a^fT^p$Bs5q>X3fEjnPuXZbw(}SCqA}`i*=i@z*w!733l;}Ukis# z-?wYVpESM7ONstro7^?Q(a=bHtTomiZ;kJj_Vw=dTb(#Ln<0K)Xx7L z!}d&sX>`<)_IH4eiJ%O3KT>m!hpINdN3$G?)@CEzohBfJ$6(|OP``qkXR->fL`gpZ DLNx0U literal 0 HcmV?d00001 diff --git a/src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc b/src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f048b2f6179c6c3069384d7fda219d366a7752b6 GIT binary patch literal 1075 zcmb_b&1(}u6rbH}woPJ+ZPd2*!xX_1C_VUbkt$k>kP7u6Z8a=i*PTf^akIP3Y^o_K zL=@DM2819K?6K1R6N2EuOG0`mV?EVNZ$dovFZ{El6&3ixI?6-J43b-6R zdXF#402~S@5bpcZy2OtffB;Gs$VmPvXJjbDkQFK@8ASq6c?mM%l9UP`iIRmQiR+Za zRMWAuIA@ArAPP4^{A(?7Pz`uN9OQ$BClLVh;V!3RdTJ1Z(l9rnywj=N0A3`><$Yp- z*FMW&M-0YgL+V1FYR9^DS%M*PcYyCg_jTjiHdJ=Q0dk%9<<;s8PyGjOJty4cvXt-t zw|7~Fy}OYBCGp<(K`OeyqE69si=GZKSx*#goW%(0(C`e3ylIoJvFKw=yj2lGMC?PX ztKit)DrKkr6f{b@M{q8O$oj-zH|S3>ukB>1xbF_H4`UmmXQ>F29m*8f@K%|Ucd$*w z&YA2&)=Q89bs7Z8ii@hR?QvwSwD@&pH@LM)1H!61t*#AKLdu?{5>pkm}? zGI_jYUe^{WB2>F=BBISXX0d>5k7{X~6SK8|N&!)7slo}DAx%xi-cmwCWE}Qc!Kl=R~2y3pQsI12Wz9%(ar3Z))>EBznX6L z&sXLTBC)q4Z${o_nvrB>_O}YglOI<;tZdJ2mm5KEmSaFf{4My?e*$DkfX;Ds+&fcnV9`Q1$tZWl|3MR6 X{R&BvbO;jPLF5ORXekk?&oBA|$)NtR literal 0 HcmV?d00001 diff --git a/src/pipeline/__pycache__/rf_scoring.cpython-312.pyc b/src/pipeline/__pycache__/rf_scoring.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9741292bcc8a7ecbfab7dfbe2a56e59c856aa409 GIT binary patch literal 1305 zcmb_cO=ufO6rTN&Vo6@7mZO@kQzdRgjKB>vZ7yvnX<`a$TAMaMEn;EW?u_JBSG!_n zl+;3QH&AGEN(}}1kb?__Qj&vnYR)|sdhw!y%_N0Fa>=dWoN~$=*<+QQ`(WSu-kbM3 zGdq7;RvrO8_I?b$))9In6-A&AY4;?EKEeoVA*yLZsn>L@V?dj%lY@9ame zBA!!g&Ea55+mk(XM>Ynus`0Ts1U1GL8p-D25mpErcg=%%y{G@D)=SD~Bzqdp$nI1` z7y4-yN9zhTk0>;f5Su*DgRX7)HmO~cX%*+E}c2IZ7-zu z!nR#a?P_v9`6e}||0%t^{@S{m+HY)Xsr_bJI=%VnZ&&YKy}$IwCxbIfX=w=@yEhge z7mxJk*5>;2YxB3SZG`Eux37fK6rQ!$YsYJcB&I;p5R51UH;J1<0&0>*Y6EH0{20SPII042v%QgK|w9_-Ed zveLLDAP*uaUCSL+DUq<)2uti+l^%I2^0#XH->5C7>#9CE2N4@MU#02ysyyc)i;CX> z&w+^P1AMniS8MpbfN=0fG#k_L^uoy*%vteM|{ zI6$szmYp=NQGdpmWYU>XPRo&urjs~GVjdcp&@x0K^B~&ew3bv6zqgQ;LKMZ%%p}bmO}LP1+^z}2_#nHGnJT$bB3DF65X~E zL%K~&J#ket?G#p~v&6ELNll+jSf-lbASLp;HlY!mgm6U3#&gpOFa2Qk@=hv9Fv z!c;_?a>HEzZ2yB}3zPTT*PA+=rjBycM`gKlUG8z@p0eCm68pA56{&3EgS~VY=N#br zE}XHzkrQ@dp61oUeFC)ue1Sde9do-I|J}~J$4+?r3&0|eAfN)&7_&{wJclZ^L!9UO zpe^-*ups;|E1_cbj;#DV${xJl&>adazJvo%chHO2xZ$b3>NnMV!GnWthmco8yjGcv zoyFei2I>8-vprM#;GU(s*mYlF*J~~JdBa=qq?zbnqu|B+R_nMHobRm_it7vB`{<#k zibuf%71l5|b8N&=Fg*U~z{Sz1qT{HV(bQ{|M<2zCt;FMT8c@w_lC=w3w`<##%$)oZ zs)9LSg%KS(XGJFrGn%tV9vk{}R+~|5&CtP%$vP*k-L9V`D<*ss|Z{Ai-)^uhfyi8fDU>;YksX@^2X*wp?XBsb6 zs+fC^TFMl`QQL^d!ft0|p{c0p69)Cc}SQvK>oGb@p#evOG?cDI}@B^uI zDjgLKyh%hsd*{B7+*Q? zSaxE4KYy^+^hI&-cCRBIeA%#nzV&YFeEZ$@2UE)#=g8?&_gCeH;o|Uy7`!!lbM#Kz z@}=VFn%KtA7HMDkc4f3w-?Jw6Zpih;Q8NN(kJXyTflw(2;3$?Eg@wr)i`l}&;mX%9 z$>1Yq<$`R!%%_bUy5T`d#_($VSuArS`@4zaYxVKP5r< MbW5re8r-!00V^#1R{#J2 literal 0 HcmV?d00001 diff --git a/src/pipeline/config_loader.py b/src/pipeline/config_loader.py new file mode 100644 index 0000000..df6bac7 --- /dev/null +++ b/src/pipeline/config_loader.py @@ -0,0 +1,9 @@ +import os +import yaml + +# Project root +BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +config_path = os.path.join(BASE_DIR, "config.yaml") + +with open(config_path, "r") as f: + config = yaml.safe_load(f) diff --git a/src/pipeline/deliverability_check.py b/src/pipeline/deliverability_check.py new file mode 100644 index 0000000..6585405 --- /dev/null +++ b/src/pipeline/deliverability_check.py @@ -0,0 +1,15 @@ +# src/pipeline/deliverability_check.py +import pandas as pd +import os +from .config_loader import config + +def run_deliverability_check(): + input_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv") + df = pd.read_csv(input_file) + + # Basic simulation of SPF/DKIM validation + df["deliverable"] = df["email"].str.contains("@") + + output_file = os.path.join(config["output_dir"], "unified_dataset_deliverable.csv") + df.to_csv(output_file, index=False) + return df diff --git a/src/pipeline/ethics_flags.py b/src/pipeline/ethics_flags.py new file mode 100644 index 0000000..688e41d --- /dev/null +++ b/src/pipeline/ethics_flags.py @@ -0,0 +1,27 @@ +# src/pipeline/ethics_flags.py +import pandas as pd +import os +from .config_loader import config + +def apply_ethics_checks(): + input_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv") + df = pd.read_csv(input_file) + + # List of sensitive columns to check + sensitive_cols = ["medical_condition", "political_opinion", "dietary_restriction"] + + # Only include columns that exist in the dataset + existing_cols = [col for col in sensitive_cols if col in df.columns] + + # Apply sensitive flag if any sensitive info is present + if existing_cols: + df["sensitive_flag"] = df[existing_cols].notna().any(axis=1) + else: + # If none exist, create the flag with False + df["sensitive_flag"] = False + + # Save output + output_file = os.path.join(config["output_dir"], "unified_dataset_ethics.csv") + df.to_csv(output_file, index=False) + + return df diff --git a/src/pipeline/list_hygiene.py b/src/pipeline/list_hygiene.py new file mode 100644 index 0000000..fdf6af1 --- /dev/null +++ b/src/pipeline/list_hygiene.py @@ -0,0 +1,15 @@ +# src/pipeline/list_hygiene.py +import pandas as pd +import os +from .config_loader import config + +def apply_list_hygiene(): + input_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv") + df = pd.read_csv(input_file) + + # Remove invalid domains, disposable emails + df = df[~df["email"].str.contains("example.com|test.com", na=False)] + + output_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv") + df.to_csv(output_file, index=False) + return df diff --git a/src/pipeline/relative_date_trigger.py b/src/pipeline/relative_date_trigger.py new file mode 100644 index 0000000..b55995b --- /dev/null +++ b/src/pipeline/relative_date_trigger.py @@ -0,0 +1,15 @@ +# src/pipeline/relative_date_trigger.py +import pandas as pd +import os +from .config_loader import config + +def apply_birthday_recursion(): + input_file = os.path.join(config["output_dir"], "unified_dataset.csv") + df = pd.read_csv(input_file) + + # Example: 11-month pre-birthday trigger + df["birthday_trigger"] = pd.to_datetime(df["birth_date"]) - pd.DateOffset(months=1) + + output_file = os.path.join(config["output_dir"], "unified_dataset_with_triggers.csv") + df.to_csv(output_file, index=False) + return df diff --git a/src/pipeline/rf_scoring.py b/src/pipeline/rf_scoring.py new file mode 100644 index 0000000..ae80bcd --- /dev/null +++ b/src/pipeline/rf_scoring.py @@ -0,0 +1,18 @@ +# src/pipeline/rf_scoring.py +import pandas as pd +import os +from .config_loader import config + +def calculate_rf_score(): + input_file = os.path.join(config["output_dir"], "unified_dataset_with_triggers.csv") + df = pd.read_csv(input_file) + + # Simple RF scoring (Recency-Frequency) + now = pd.Timestamp.now(tz="UTC") + df["recency_index"] = (now - pd.to_datetime(df["timestamp"])).dt.days + + df["rf_score"] = 1 / (df["recency_index"] + 1) + df.get("session_count", 0) + + output_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv") + df.to_csv(output_file, index=False) + return df diff --git a/src/pipeline/unified_stream.py b/src/pipeline/unified_stream.py new file mode 100644 index 0000000..d27f90d --- /dev/null +++ b/src/pipeline/unified_stream.py @@ -0,0 +1,32 @@ +# src/pipeline/unified_stream.py +import os +import pandas as pd +import hashlib +import logging +from .config_loader import config + +os.makedirs(config["output_dir"], exist_ok=True) +os.makedirs(config["log_dir"], exist_ok=True) +logging.basicConfig(filename=os.path.join(config["log_dir"], "pipeline.log"), + level=logging.INFO) + +def hash_user_id(user_id): + return hashlib.sha256(str(user_id).encode()).hexdigest() + +def load_and_merge(): + logging.info("Loading CRM and clickstream data...") + crm = pd.read_csv(config["crm_file"]) + click = pd.read_csv(config["clickstream_file"]) + + logging.info("Hashing user IDs for pseudonymization...") + crm["user_id_hashed"] = crm["user_id"].apply(hash_user_id) + click["user_id_hashed"] = click["user_id"].apply(hash_user_id) + + logging.info("Merging datasets...") + merged = pd.merge(crm, click, on="user_id_hashed", how="outer") + merged["timestamp"] = pd.to_datetime(merged["timestamp"], utc=True) + + output_file = os.path.join(config["output_dir"], "unified_dataset.csv") + merged.to_csv(output_file, index=False) + logging.info(f"Unified dataset saved to {output_file}") + return merged diff --git a/wiki/gdpr-audit.md b/wiki/gdpr-audit.md new file mode 100644 index 0000000..3ccea82 --- /dev/null +++ b/wiki/gdpr-audit.md @@ -0,0 +1,54 @@ +# \# GDPR Adequacy Checklist + +# + +# \## Governance \& Breach Protocols + +# \- DPO appointed and documented + +# \- 72-hour breach notification process defined + +# + +# \## Consent \& Data Collection + +# \- No pre-ticked marketing consent boxes + +# \- Explicit consent required for birthday listings + +# + +# \## Sensitive Data Handling + +# \- Medical data restricted to "unwell" status + +# \- Dietary data shared only with explicit consent + +# + +# \## International Data Transfers + +# \- Photo/video sharing audited for adequacy mechanisms + +# + + +## Data Protection Officer (DPO) +Name: [Your Name Here] +Role: [Your Role Here] +Contact: [email@example.com] +Date Appointed: [YYYY-MM-DD] + +## Data Protection Officer (DPO) +Name: +Role: +Contact: +Date Appointed: + +## Regulatory Feature Specifications + +### Right of Access (Article 15) +Users can request a downloadable summary of stored personal data. + +### Right of Erasure (Article 17) +Users can trigger full deletion across all systems and subprocessors.