diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..680c808 --- /dev/null +++ b/config.yaml @@ -0,0 +1,4 @@ +crm_file: "sample_data/crm.csv" +clickstream_file: "sample_data/clickstream.csv" +output_dir: "output" +log_dir: "logs" diff --git a/logs/pipeline.log b/logs/pipeline.log new file mode 100644 index 0000000..1cb4cb9 --- /dev/null +++ b/logs/pipeline.log @@ -0,0 +1,64 @@ +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:=== WEEK 2 PIPELINE START === +INFO:root:Loading CRM and clickstream data... +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... 
+INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... 
# main.py
"""Command-line entry point that runs every Week 2 pipeline stage in order."""
from src.pipeline.unified_stream import load_and_merge
from src.pipeline.relative_date_trigger import apply_birthday_recursion
from src.pipeline.rf_scoring import calculate_rf_score
from src.pipeline.list_hygiene import apply_list_hygiene
from src.pipeline.ethics_flags import apply_ethics_checks
from src.pipeline.deliverability_check import run_deliverability_check


def main():
    """Run the pipeline stages sequentially.

    The order matters: every stage reads the CSV written by an earlier one
    (merge -> birthday trigger -> RF score -> list hygiene), and the two
    final checks both read the cleaned dataset produced by list hygiene.
    """
    print("=== WEEK 2 PIPELINE START ===")

    load_and_merge()
    apply_birthday_recursion()
    calculate_rf_score()
    apply_list_hygiene()

    apply_ethics_checks()
    run_deliverability_check()

    print("=== WEEK 2 PIPELINE COMPLETE ===")


# Guard the entry point so importing this module (tests, tooling) does not
# execute the whole pipeline as a side effect.
if __name__ == "__main__":
    main()
17:10:00+00:00,2,45,701,1993-02-15,160,0.006211180124223602 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.015384615384615385 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.07142857142857142 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388345 diff --git a/output/unified_dataset.csv b/output/unified_dataset.csv new file mode 100644 index 0000000..3241e63 --- /dev/null +++ b/output/unified_dataset.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 
08:50:00+00:00,6,180,202 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501 diff --git a/output/unified_dataset_cleaned.csv b/output/unified_dataset_cleaned.csv new file mode 100644 index 0000000..42d281b --- /dev/null +++ b/output/unified_dataset_cleaned.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 
09:00:00+00:00,4,120,601,1995-11-01,24,0.04 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388 diff --git a/output/unified_dataset_deliverable.csv b/output/unified_dataset_deliverable.csv new file mode 100644 index 0000000..90771ec --- /dev/null +++ b/output/unified_dataset_deliverable.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,deliverable +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,True +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,True +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,True +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,True +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,True +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,True +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,True +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 
09:00:00+00:00,4,120,601,1995-11-01,24,0.04,True +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,True diff --git a/output/unified_dataset_ethics.csv b/output/unified_dataset_ethics.csv new file mode 100644 index 0000000..abbfeac --- /dev/null +++ b/output/unified_dataset_ethics.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,sensitive_flag +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,False +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,False +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,False +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,False +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,False +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,False +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,False +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 
09:00:00+00:00,4,120,601,1995-11-01,24,0.04,False +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,False diff --git a/output/unified_dataset_with_triggers.csv b/output/unified_dataset_with_triggers.csv new file mode 100644 index 0000000..265c897 --- /dev/null +++ b/output/unified_dataset_with_triggers.csv @@ -0,0 +1,10 @@ +user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10 diff --git a/requirement.txt b/requirement.txt 
new file mode 100644 index 0000000..e69de29 diff --git a/sample_data/clickstream.csv b/sample_data/clickstream.csv new file mode 100644 index 0000000..473f134 --- /dev/null +++ b/sample_data/clickstream.csv @@ -0,0 +1,10 @@ +user_id,timestamp,page_depth,dwell_time,session_id +1,2026-01-15 09:15:00,5,120,101 +1,2026-02-01 10:20:00,3,90,102 +2,2025-12-20 14:10:00,4,60,201 +2,2026-02-10 08:50:00,6,180,202 +3,2026-01-05 12:30:00,2,30,301 +4,2026-01-25 16:45:00,3,60,401 +5,2025-08-01 11:00:00,5,150,501 +6,2026-01-30 09:00:00,4,120,601 +7,2025-09-15 17:10:00,2,45,701 diff --git a/sample_data/crm.csv b/sample_data/crm.csv new file mode 100644 index 0000000..6c6031c --- /dev/null +++ b/sample_data/crm.csv @@ -0,0 +1,8 @@ +user_id,email,birth_date,gender,consent_flag +1,alice@gmail.com,1990-02-17,F,True +2,bob@yahoo.com,1985-07-05,M,True +3,charlie@tempmail.com,2000-11-21,M,False +4,dana@outlook.com,1992-05-30,F,True +5,eric@mailinator.com,1988-09-10,M,True +6,frank@gmail.com,1995-12-01,M,True +7,grace@no-reply.com,1993-03-15,F,True diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/__pycache__/__init__.cpython-312.pyc b/src/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..1e388c6 Binary files /dev/null and b/src/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/automation/__init__.py b/src/automation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pipeline/__init__.py b/src/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pipeline/__pycache__/__init__.cpython-312.pyc b/src/pipeline/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..9f12bf5 Binary files /dev/null and b/src/pipeline/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/pipeline/__pycache__/config_loader.cpython-312.pyc b/src/pipeline/__pycache__/config_loader.cpython-312.pyc new file mode 100644 index 0000000..62173ca Binary files 
/dev/null and b/src/pipeline/__pycache__/config_loader.cpython-312.pyc differ diff --git a/src/pipeline/__pycache__/deliverability_check.cpython-312.pyc b/src/pipeline/__pycache__/deliverability_check.cpython-312.pyc new file mode 100644 index 0000000..6f0ab3a Binary files /dev/null and b/src/pipeline/__pycache__/deliverability_check.cpython-312.pyc differ diff --git a/src/pipeline/__pycache__/ethics_flags.cpython-312.pyc b/src/pipeline/__pycache__/ethics_flags.cpython-312.pyc new file mode 100644 index 0000000..6f93269 Binary files /dev/null and b/src/pipeline/__pycache__/ethics_flags.cpython-312.pyc differ diff --git a/src/pipeline/__pycache__/list_hygiene.cpython-312.pyc b/src/pipeline/__pycache__/list_hygiene.cpython-312.pyc new file mode 100644 index 0000000..9119a49 Binary files /dev/null and b/src/pipeline/__pycache__/list_hygiene.cpython-312.pyc differ diff --git a/src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc b/src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc new file mode 100644 index 0000000..f048b2f Binary files /dev/null and b/src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc differ diff --git a/src/pipeline/__pycache__/rf_scoring.cpython-312.pyc b/src/pipeline/__pycache__/rf_scoring.cpython-312.pyc new file mode 100644 index 0000000..9741292 Binary files /dev/null and b/src/pipeline/__pycache__/rf_scoring.cpython-312.pyc differ diff --git a/src/pipeline/__pycache__/unified_stream.cpython-312.pyc b/src/pipeline/__pycache__/unified_stream.cpython-312.pyc new file mode 100644 index 0000000..f97b20a Binary files /dev/null and b/src/pipeline/__pycache__/unified_stream.cpython-312.pyc differ diff --git a/src/pipeline/config_loader.py b/src/pipeline/config_loader.py new file mode 100644 index 0000000..df6bac7 --- /dev/null +++ b/src/pipeline/config_loader.py @@ -0,0 +1,9 @@ +import os +import yaml + +# Project root +BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +config_path 
# src/pipeline/deliverability_check.py and src/pipeline/ethics_flags.py
import os

import pandas as pd

# Minimal syntactic shape of an address: a non-empty local part, exactly one
# "@", and a domain containing at least one dot. ``str.match`` anchors at the
# start of the string; ``\Z`` anchors at the end.
_EMAIL_PATTERN = r"[^@\s]+@[^@\s]+\.[^@\s]+\Z"

# Columns whose presence marks a row as carrying sensitive information.
_SENSITIVE_COLUMNS = ("medical_condition", "political_opinion", "dietary_restriction")


def _output_path(filename):
    """Return *filename* joined onto the configured ``output_dir``."""
    # Local import: config_loader opens config.yaml at import time, so it is
    # only loaded when a caller actually relies on the configured defaults.
    from .config_loader import config

    return os.path.join(config["output_dir"], filename)


def run_deliverability_check(*, input_file=None, output_file=None):
    """Add a boolean ``deliverable`` column to the cleaned dataset.

    The check is purely syntactic (see ``_EMAIL_PATTERN``); it performs no
    SPF/DKIM or DNS lookup. Surrounding whitespace is ignored, and a missing
    address is reported as ``False`` instead of NaN so the column is always
    a plain boolean.

    Args:
        input_file: CSV to read. Defaults to ``unified_dataset_cleaned.csv``
            in the configured output directory.
        output_file: CSV to write. Defaults to
            ``unified_dataset_deliverable.csv`` in the same directory.

    Returns:
        The DataFrame that was written, including the ``deliverable`` column.
    """
    if input_file is None:
        input_file = _output_path("unified_dataset_cleaned.csv")
    if output_file is None:
        output_file = _output_path("unified_dataset_deliverable.csv")

    df = pd.read_csv(input_file)

    # Cast to the string dtype first so an all-empty column (read as float)
    # still exposes the ``.str`` accessor.
    addresses = df["email"].astype("string").str.strip()
    df["deliverable"] = addresses.str.match(_EMAIL_PATTERN, na=False).astype(bool)

    df.to_csv(output_file, index=False)
    return df


def apply_ethics_checks(*, input_file=None, output_file=None,
                        sensitive_cols=_SENSITIVE_COLUMNS):
    """Add a boolean ``sensitive_flag`` column to the cleaned dataset.

    A row is flagged when any of the sensitive columns that exist in the
    dataset holds a non-missing value. When none of those columns exist,
    every row is flagged ``False``.

    Args:
        input_file: CSV to read. Defaults to ``unified_dataset_cleaned.csv``
            in the configured output directory.
        output_file: CSV to write. Defaults to ``unified_dataset_ethics.csv``
            in the same directory.
        sensitive_cols: Column names to inspect.

    Returns:
        The DataFrame that was written, including ``sensitive_flag``.
    """
    if input_file is None:
        input_file = _output_path("unified_dataset_cleaned.csv")
    if output_file is None:
        output_file = _output_path("unified_dataset_ethics.csv")

    df = pd.read_csv(input_file)

    # Only inspect the sensitive columns this dataset actually carries.
    present = [col for col in sensitive_cols if col in df.columns]
    if present:
        df["sensitive_flag"] = df[present].notna().any(axis=1)
    else:
        df["sensitive_flag"] = False

    df.to_csv(output_file, index=False)
    return df
# src/pipeline/list_hygiene.py, relative_date_trigger.py, rf_scoring.py
# and unified_stream.py
import hashlib
import logging
import os

import pandas as pd

# Domains removed by list hygiene. Disposable-mail providers are NOT in this
# default list; pass ``blocked_domains`` to filter them as well.
_BLOCKED_DOMAINS = ("example.com", "test.com")


def _output_path(filename):
    """Return *filename* joined onto the configured ``output_dir``."""
    # Local import: config_loader opens config.yaml at import time, so it is
    # only loaded when a caller actually relies on the configured defaults.
    from .config_loader import config

    return os.path.join(config["output_dir"], filename)


def _configure_runtime():
    """Create the configured output/log directories and set up file logging.

    This used to run as an import-time side effect of unified_stream; it is
    now invoked by ``load_and_merge`` when it falls back to the configured
    paths. ``logging.basicConfig`` does nothing once the root logger has
    handlers, so repeated calls are harmless.
    """
    from .config_loader import config

    os.makedirs(config["output_dir"], exist_ok=True)
    os.makedirs(config["log_dir"], exist_ok=True)
    logging.basicConfig(filename=os.path.join(config["log_dir"], "pipeline.log"),
                        level=logging.INFO)


# --- list hygiene ----------------------------------------------------------

def apply_list_hygiene(*, input_file=None, output_file=None,
                       blocked_domains=_BLOCKED_DOMAINS):
    """Drop rows whose email belongs to a blocked domain.

    The domain is the text after the last "@", compared case-insensitively.
    A domain is blocked when it equals a blocked entry or is a subdomain of
    one. Matching the domain (instead of a regex over the whole address)
    avoids false positives such as ``contest.com`` or a local part that
    merely contains ``test.com``. Rows with a missing email are kept.

    Args:
        input_file: CSV to read. Defaults to ``rf_scored_dataset.csv`` in the
            configured output directory.
        output_file: CSV to write. Defaults to ``unified_dataset_cleaned.csv``
            in the same directory.
        blocked_domains: Iterable of domain names to remove.

    Returns:
        The filtered DataFrame that was written.
    """
    if input_file is None:
        input_file = _output_path("rf_scored_dataset.csv")
    if output_file is None:
        output_file = _output_path("unified_dataset_cleaned.csv")

    df = pd.read_csv(input_file)

    blocked = {domain.lower() for domain in blocked_domains}
    subdomain_suffixes = tuple("." + domain for domain in blocked)

    def is_blocked(email):
        if not isinstance(email, str):  # NaN for missing addresses
            return False
        domain = email.rsplit("@", 1)[-1].strip().lower()
        return domain in blocked or domain.endswith(subdomain_suffixes)

    df = df[~df["email"].map(is_blocked).astype(bool)]

    df.to_csv(output_file, index=False)
    return df


# --- relative date trigger -------------------------------------------------

def apply_birthday_recursion(*, input_file=None, output_file=None, months_before=1):
    """Add a ``birthday_trigger`` column: ``birth_date`` minus a month offset.

    NOTE(review): the trigger keeps the birth *year* (e.g. 1992-04-30), so a
    scheduler has to compare month/day only. The previous comment described
    an "11-month" trigger while the code subtracted one month; the offset is
    now a parameter whose default preserves the existing output.

    Args:
        input_file: CSV to read. Defaults to ``unified_dataset.csv`` in the
            configured output directory.
        output_file: CSV to write. Defaults to
            ``unified_dataset_with_triggers.csv`` in the same directory.
        months_before: Calendar months subtracted from ``birth_date``.

    Returns:
        The DataFrame that was written, including ``birthday_trigger``.
    """
    if input_file is None:
        input_file = _output_path("unified_dataset.csv")
    if output_file is None:
        output_file = _output_path("unified_dataset_with_triggers.csv")

    df = pd.read_csv(input_file)

    birth_dates = pd.to_datetime(df["birth_date"])
    df["birthday_trigger"] = birth_dates - pd.DateOffset(months=months_before)

    df.to_csv(output_file, index=False)
    return df


# --- recency / frequency scoring -------------------------------------------

def calculate_rf_score(*, input_file=None, output_file=None, now=None):
    """Add ``recency_index`` and ``rf_score`` columns.

    ``recency_index`` is the number of whole days between ``now`` and the
    row's ``timestamp``, floored at 0 so a timestamp in the future cannot
    produce a division by zero or a negative score.

    ``rf_score`` is ``1 / (recency_index + 1) + frequency``. Frequency is the
    ``session_count`` column when present; otherwise it is derived as the
    number of distinct ``session_id`` values per ``user_id_hashed``; if
    those columns are missing too it is 0. Rows without a timestamp get a
    recency contribution of 0.

    Args:
        input_file: CSV to read. Defaults to
            ``unified_dataset_with_triggers.csv`` in the configured output
            directory.
        output_file: CSV to write. Defaults to ``rf_scored_dataset.csv`` in
            the same directory.
        now: Reference instant (anything ``pd.Timestamp`` accepts; naive
            values are taken as UTC). Defaults to the current UTC time.

    Returns:
        The DataFrame that was written, including both new columns.
    """
    if input_file is None:
        input_file = _output_path("unified_dataset_with_triggers.csv")
    if output_file is None:
        output_file = _output_path("rf_scored_dataset.csv")

    df = pd.read_csv(input_file)

    if now is None:
        now = pd.Timestamp.now(tz="UTC")
    else:
        now = pd.Timestamp(now)
        now = now.tz_localize("UTC") if now.tzinfo is None else now.tz_convert("UTC")

    # utc=True makes naive timestamps comparable with the tz-aware ``now``.
    timestamps = pd.to_datetime(df["timestamp"], utc=True)
    df["recency_index"] = (now - timestamps).dt.days.clip(lower=0)

    if "session_count" in df.columns:
        frequency = df["session_count"]
    elif "user_id_hashed" in df.columns and "session_id" in df.columns:
        frequency = (
            df.groupby("user_id_hashed")["session_id"].transform("nunique").fillna(0)
        )
    else:
        frequency = 0

    recency_term = (1 / (df["recency_index"] + 1)).fillna(0.0)
    df["rf_score"] = recency_term + frequency

    df.to_csv(output_file, index=False)
    return df


# --- unified stream --------------------------------------------------------

def hash_user_id(user_id, salt=""):
    """Return the SHA-256 hex digest of ``salt`` followed by the user id.

    Integer-valued floats are normalised to ints first: pandas reads an ID
    column containing blanks as float, and ``"1.0"`` would otherwise hash
    differently from ``"1"`` and silently break the join on the hash.

    NOTE(review): with the default empty salt the digest of a small
    sequential id can be recovered by brute force.
    """
    if isinstance(user_id, float) and user_id.is_integer():
        user_id = int(user_id)
    return hashlib.sha256(f"{salt}{user_id}".encode()).hexdigest()


def load_and_merge(*, crm_file=None, clickstream_file=None, output_file=None):
    """Load CRM and clickstream CSVs, join them on the hashed user id, and save.

    The join is an outer merge on ``user_id_hashed``; ``timestamp`` is
    converted to timezone-aware UTC before the result is written.

    NOTE(review): the raw ``user_id`` columns (suffixed ``_x``/``_y`` by the
    merge) and ``email`` remain in the output next to the hash.

    Args:
        crm_file: CRM CSV path. Defaults to ``config["crm_file"]``.
        clickstream_file: Clickstream CSV path. Defaults to
            ``config["clickstream_file"]``.
        output_file: Destination CSV. Defaults to ``unified_dataset.csv`` in
            the configured output directory.

    Returns:
        The merged DataFrame that was written.
    """
    if crm_file is None or clickstream_file is None or output_file is None:
        from .config_loader import config

        _configure_runtime()
        if crm_file is None:
            crm_file = config["crm_file"]
        if clickstream_file is None:
            clickstream_file = config["clickstream_file"]
        if output_file is None:
            output_file = _output_path("unified_dataset.csv")

    logging.info("Loading CRM and clickstream data...")
    crm = pd.read_csv(crm_file)
    click = pd.read_csv(clickstream_file)

    logging.info("Hashing user IDs for pseudonymization...")
    crm["user_id_hashed"] = crm["user_id"].apply(hash_user_id)
    click["user_id_hashed"] = click["user_id"].apply(hash_user_id)

    logging.info("Merging datasets...")
    merged = pd.merge(crm, click, on="user_id_hashed", how="outer")
    merged["timestamp"] = pd.to_datetime(merged["timestamp"], utc=True)

    merged.to_csv(output_file, index=False)
    logging.info("Unified dataset saved to %s", output_file)
    return merged
\## International Data Transfers + +# \- Photo/video sharing audited for adequacy mechanisms + +# + + +## Data Protection Officer (DPO) +Name: [Your Name Here] +Role: [Your Role Here] +Contact: [email@example.com] +Date Appointed: [YYYY-MM-DD] + +## Regulatory Feature Specifications + +### Right of Access (Article 15) +Users can request a downloadable summary of stored personal data. + +### Right of Erasure (Article 17) +Users can trigger full deletion across all systems and subprocessors.