week2
This commit is contained in:
@@ -0,0 +1,4 @@
|
|||||||
|
crm_file: "sample_data/crm.csv"
|
||||||
|
clickstream_file: "sample_data/clickstream.csv"
|
||||||
|
output_dir: "output"
|
||||||
|
log_dir: "logs"
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
# main.py
|
||||||
|
from src.pipeline.unified_stream import load_and_merge
|
||||||
|
from src.pipeline.relative_date_trigger import apply_birthday_recursion
|
||||||
|
from src.pipeline.rf_scoring import calculate_rf_score
|
||||||
|
from src.pipeline.list_hygiene import apply_list_hygiene
|
||||||
|
from src.pipeline.ethics_flags import apply_ethics_checks
|
||||||
|
from src.pipeline.deliverability_check import run_deliverability_check
|
||||||
|
|
||||||
|
# Run the Week 2 pipeline: every stage is a zero-argument callable that is
# invoked for its effect, in exactly the order listed here.
print("=== WEEK 2 PIPELINE START ===")

for stage in (
    load_and_merge,
    apply_birthday_recursion,
    calculate_rf_score,
    apply_list_hygiene,
    apply_ethics_checks,
    run_deliverability_check,
):
    stage()

print("=== WEEK 2 PIPELINE COMPLETE ===")
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.034482758620689655
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.02040816326530612
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.02564102564102564
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.045454545454545456
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.006211180124223602
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.015384615384615385
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.07142857142857142
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388345
|
||||||
|
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501
|
||||||
|
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388
|
||||||
|
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,deliverable
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,True
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,True
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,True
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,True
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,True
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,True
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,True
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,True
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,True
|
||||||
|
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,sensitive_flag
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,False
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,False
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,False
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,False
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,False
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,False
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,False
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,False
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,False
|
||||||
|
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10
|
||||||
|
@@ -0,0 +1,10 @@
|
|||||||
|
user_id,timestamp,page_depth,dwell_time,session_id
|
||||||
|
1,2026-01-15 09:15:00,5,120,101
|
||||||
|
1,2026-02-01 10:20:00,3,90,102
|
||||||
|
2,2025-12-20 14:10:00,4,60,201
|
||||||
|
2,2026-02-10 08:50:00,6,180,202
|
||||||
|
3,2026-01-05 12:30:00,2,30,301
|
||||||
|
4,2026-01-25 16:45:00,3,60,401
|
||||||
|
5,2025-08-01 11:00:00,5,150,501
|
||||||
|
6,2026-01-30 09:00:00,4,120,601
|
||||||
|
7,2025-09-15 17:10:00,2,45,701
|
||||||
|
@@ -0,0 +1,8 @@
|
|||||||
|
user_id,email,birth_date,gender,consent_flag
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,9 @@
|
|||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Project root: the directory two levels above the one containing this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
config_path = os.path.join(BASE_DIR, "config.yaml")

# Parse the YAML once at import time and expose the result as `config`.
# The explicit encoding keeps parsing independent of the platform's default
# locale encoding.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
# src/pipeline/deliverability_check.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def run_deliverability_check():
    """Add a boolean ``deliverable`` column to the cleaned dataset.

    Reads ``unified_dataset_cleaned.csv`` from the configured output
    directory, flags each row, and writes the result to
    ``unified_dataset_deliverable.csv`` in the same directory.

    Returns:
        The DataFrame that was written, including the ``deliverable`` column.
    """
    input_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")
    df = pd.read_csv(input_file)

    # Stand-in for real SPF/DKIM validation: an address counts as deliverable
    # only if it contains a literal "@". `regex=False` treats "@" as plain
    # text, and `na=False` makes a missing email explicitly undeliverable
    # instead of leaving NaN in a column that is meant to be boolean.
    df["deliverable"] = df["email"].str.contains("@", regex=False, na=False)

    output_file = os.path.join(config["output_dir"], "unified_dataset_deliverable.csv")
    df.to_csv(output_file, index=False)
    return df
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
# src/pipeline/ethics_flags.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def apply_ethics_checks():
    """Flag rows of the cleaned dataset that carry sensitive attributes.

    Reads ``unified_dataset_cleaned.csv`` from the configured output
    directory, adds a ``sensitive_flag`` column and writes the result to
    ``unified_dataset_ethics.csv`` in the same directory.

    Returns:
        The DataFrame that was written, including ``sensitive_flag``.
    """
    source_path = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")
    frame = pd.read_csv(source_path)

    # Of the known sensitive columns, keep only those the dataset contains.
    present = [
        name
        for name in ("medical_condition", "political_opinion", "dietary_restriction")
        if name in frame.columns
    ]

    # A row is flagged when any present sensitive column holds a value;
    # when none of those columns exist, every row is flagged False.
    frame["sensitive_flag"] = (
        frame[present].notna().any(axis=1) if present else False
    )

    target_path = os.path.join(config["output_dir"], "unified_dataset_ethics.csv")
    frame.to_csv(target_path, index=False)

    return frame
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
# src/pipeline/list_hygiene.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def apply_list_hygiene(blocked_domains=("example.com", "test.com")):
    """Drop rows whose email address belongs to a blocked domain.

    Reads ``rf_scored_dataset.csv`` from the configured output directory,
    removes the matching rows and writes the remainder to
    ``unified_dataset_cleaned.csv`` in the same directory.

    Args:
        blocked_domains: Domains to reject. An address is rejected when its
            domain equals one of these or is a subdomain of one. Matching is
            case-insensitive. The default list holds placeholder domains
            only; pass disposable-mail domains explicitly to filter them too.

    Returns:
        The filtered DataFrame that was written.
    """
    import re  # local import: only needed to escape the domain names

    input_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv")
    df = pd.read_csv(input_file)

    if blocked_domains:
        # Escape each domain so "." is literal, and anchor the match to the
        # end of the address right after "@" or a subdomain dot. This stops
        # "test.com" from matching "contest.com" or text in the local part.
        alternatives = "|".join(re.escape(domain) for domain in blocked_domains)
        pattern = rf"(?:@|\.)(?:{alternatives})\s*$"
        # na=False keeps rows with a missing email rather than dropping them.
        is_blocked = df["email"].str.contains(pattern, case=False, regex=True, na=False)
        df = df[~is_blocked]

    output_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")
    df.to_csv(output_file, index=False)
    return df
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
# src/pipeline/relative_date_trigger.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def apply_birthday_recursion():
    """Add a ``birthday_trigger`` date column to the unified dataset.

    Reads ``unified_dataset.csv`` from the configured output directory and
    writes the augmented table to ``unified_dataset_with_triggers.csv`` in
    the same directory.

    Returns:
        The DataFrame that was written, including ``birthday_trigger``.
    """
    out_dir = config["output_dir"]
    frame = pd.read_csv(os.path.join(out_dir, "unified_dataset.csv"))

    # The trigger is exactly one calendar month before ``birth_date``, so it
    # falls in the birth year itself.
    # NOTE(review): confirm whether a date in the upcoming year, or an
    # 11-month offset, was intended.
    birth_dates = pd.to_datetime(frame["birth_date"])
    frame["birthday_trigger"] = birth_dates - pd.DateOffset(months=1)

    frame.to_csv(
        os.path.join(out_dir, "unified_dataset_with_triggers.csv"),
        index=False,
    )
    return frame
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# src/pipeline/rf_scoring.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def calculate_rf_score():
    """Add ``recency_index`` and ``rf_score`` columns to the trigger dataset.

    Reads ``unified_dataset_with_triggers.csv`` from the configured output
    directory and writes the scored table to ``rf_scored_dataset.csv`` in
    the same directory.

    ``recency_index`` is the whole number of days between now (UTC) and the
    row's ``timestamp``, floored at 0. ``rf_score`` is
    ``1 / (recency_index + 1)`` plus the row's ``session_count`` when that
    column exists.

    Returns:
        The DataFrame that was written, including both new columns.
    """
    input_file = os.path.join(config["output_dir"], "unified_dataset_with_triggers.csv")
    df = pd.read_csv(input_file)

    now = pd.Timestamp.now(tz="UTC")
    # utc=True lets naive timestamps be subtracted from the tz-aware `now`
    # instead of raising; already-aware values are converted to UTC.
    timestamps = pd.to_datetime(df["timestamp"], utc=True)
    # Floor at 0: a timestamp slightly in the future would give -1 days and
    # make the denominator below zero (score = inf).
    df["recency_index"] = (now - timestamps).dt.days.clip(lower=0)

    # Frequency term: falls back to 0 when there is no `session_count` column.
    # NOTE(review): the datasets shown with this step have no such column, so
    # the score is currently recency-only - confirm where it should come from.
    df["rf_score"] = 1 / (df["recency_index"] + 1) + df.get("session_count", 0)

    output_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv")
    df.to_csv(output_file, index=False)
    return df
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
# src/pipeline/unified_stream.py
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
# Create the configured output and log directories if they are missing.
for _dir_key in ("output_dir", "log_dir"):
    os.makedirs(config[_dir_key], exist_ok=True)

# Route root-logger records at INFO and above to pipeline.log inside the
# configured log directory.
_log_path = os.path.join(config["log_dir"], "pipeline.log")
logging.basicConfig(filename=_log_path, level=logging.INFO)
|
||||||
|
|
||||||
|
def hash_user_id(user_id, *, salt=""):
    """Return the SHA-256 hex digest of a user ID's string form.

    Args:
        user_id: The identifier to pseudonymize; it is converted with
            ``str()`` before hashing.
        salt: Optional text prepended to the ID before hashing. The default
            (empty) reproduces the plain, unsalted digest. An unsalted hash
            of small sequential IDs can be reversed by enumeration, so pass
            a secret salt where that matters.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    # pandas loads an integer column that contains a missing value as float,
    # so ID 1 can arrive as 1.0. Normalize integral floats so both forms hash
    # the same and still join across datasets.
    if isinstance(user_id, float) and user_id.is_integer():
        user_id = int(user_id)
    return hashlib.sha256((salt + str(user_id)).encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
def load_and_merge():
    """Join the CRM and clickstream CSVs on a hashed user ID and save them.

    Both input paths come from the configuration. Each table gets a
    ``user_id_hashed`` column, the tables are outer-joined on it, the
    ``timestamp`` column is parsed as UTC, and the result is written to
    ``unified_dataset.csv`` in the configured output directory.

    Note that the join key is only the hash, so each table's raw ``user_id``
    column is carried into the output as well.

    Returns:
        The merged DataFrame that was written.
    """
    logging.info("Loading CRM and clickstream data...")
    crm_frame = pd.read_csv(config["crm_file"])
    click_frame = pd.read_csv(config["clickstream_file"])

    logging.info("Hashing user IDs for pseudonymization...")
    for frame in (crm_frame, click_frame):
        frame["user_id_hashed"] = frame["user_id"].apply(hash_user_id)

    logging.info("Merging datasets...")
    # Outer join: rows present in only one of the two tables are kept.
    unified = crm_frame.merge(click_frame, on="user_id_hashed", how="outer")
    unified["timestamp"] = pd.to_datetime(unified["timestamp"], utc=True)

    destination = os.path.join(config["output_dir"], "unified_dataset.csv")
    unified.to_csv(destination, index=False)
    logging.info(f"Unified dataset saved to {destination}")
    return unified
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# GDPR Adequacy Checklist
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## Governance & Breach Protocols

- DPO appointed and documented
- 72-hour breach notification process defined
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## Consent & Data Collection

- No pre-ticked marketing consent boxes
- Explicit consent required for birthday listings
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## Sensitive Data Handling

- Medical data restricted to "unwell" status
- Dietary data shared only with explicit consent
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## International Data Transfers

- Photo/video sharing audited for adequacy mechanisms
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
## Data Protection Officer (DPO)
|
||||||
|
Name: [Your Name Here]
|
||||||
|
Role: [Your Role Here]
|
||||||
|
Contact: [email@example.com]
|
||||||
|
Date Appointed: [YYYY-MM-DD]
|
||||||
|
|
||||||
|
## Data Protection Officer (DPO)
|
||||||
|
Name:
|
||||||
|
Role:
|
||||||
|
Contact:
|
||||||
|
Date Appointed:
|
||||||
|
|
||||||
|
## Regulatory Feature Specifications
|
||||||
|
|
||||||
|
### Right of Access (Article 15)
|
||||||
|
Users can request a downloadable summary of stored personal data.
|
||||||
|
|
||||||
|
### Right of Erasure (Article 17)
|
||||||
|
Users can trigger full deletion across all systems and subprocessors.
|
||||||
Reference in New Issue
Block a user