week2
This commit is contained in:
4
config.yaml
Normal file
4
config.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
crm_file: "sample_data/crm.csv"
|
||||||
|
clickstream_file: "sample_data/clickstream.csv"
|
||||||
|
output_dir: "output"
|
||||||
|
log_dir: "logs"
|
||||||
64
logs/pipeline.log
Normal file
64
logs/pipeline.log
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:=== WEEK 2 PIPELINE START ===
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
|
INFO:root:Loading CRM and clickstream data...
|
||||||
|
INFO:root:Hashing user IDs for pseudonymization...
|
||||||
|
INFO:root:Merging datasets...
|
||||||
|
INFO:root:Unified dataset saved to output\unified_dataset.csv
|
||||||
19
main.py
Normal file
19
main.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# main.py
|
||||||
|
from src.pipeline.unified_stream import load_and_merge
|
||||||
|
from src.pipeline.relative_date_trigger import apply_birthday_recursion
|
||||||
|
from src.pipeline.rf_scoring import calculate_rf_score
|
||||||
|
from src.pipeline.list_hygiene import apply_list_hygiene
|
||||||
|
from src.pipeline.ethics_flags import apply_ethics_checks
|
||||||
|
from src.pipeline.deliverability_check import run_deliverability_check
|
||||||
|
|
||||||
|
# Entry script: announce the run, execute every pipeline stage once, in order,
# then announce completion. Stages hand data to each other through CSV files
# in the configured output directory, so the sequence below must be preserved.
print("=== WEEK 2 PIPELINE START ===")

for run_stage in (
    load_and_merge,
    apply_birthday_recursion,
    calculate_rf_score,
    apply_list_hygiene,
    apply_ethics_checks,
    run_deliverability_check,
):
    run_stage()

print("=== WEEK 2 PIPELINE COMPLETE ===")
|
||||||
10
output/rf_scored_dataset.csv
Normal file
10
output/rf_scored_dataset.csv
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.034482758620689655
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.02040816326530612
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.02564102564102564
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.045454545454545456
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.006211180124223602
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.015384615384615385
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.07142857142857142
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388345
|
||||||
|
10
output/unified_dataset.csv
Normal file
10
output/unified_dataset.csv
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501
|
||||||
|
10
output/unified_dataset_cleaned.csv
Normal file
10
output/unified_dataset_cleaned.csv
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388
|
||||||
|
10
output/unified_dataset_deliverable.csv
Normal file
10
output/unified_dataset_deliverable.csv
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,deliverable
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,True
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,True
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,True
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,True
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,True
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,True
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,True
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,True
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,True
|
||||||
|
10
output/unified_dataset_ethics.csv
Normal file
10
output/unified_dataset_ethics.csv
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,sensitive_flag
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,False
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,False
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,False
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,False
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,False
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,False
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,False
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,False
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,False
|
||||||
|
10
output/unified_dataset_with_triggers.csv
Normal file
10
output/unified_dataset_with_triggers.csv
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10
|
||||||
|
0
requirement.txt
Normal file
0
requirement.txt
Normal file
10
sample_data/clickstream.csv
Normal file
10
sample_data/clickstream.csv
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
user_id,timestamp,page_depth,dwell_time,session_id
|
||||||
|
1,2026-01-15 09:15:00,5,120,101
|
||||||
|
1,2026-02-01 10:20:00,3,90,102
|
||||||
|
2,2025-12-20 14:10:00,4,60,201
|
||||||
|
2,2026-02-10 08:50:00,6,180,202
|
||||||
|
3,2026-01-05 12:30:00,2,30,301
|
||||||
|
4,2026-01-25 16:45:00,3,60,401
|
||||||
|
5,2025-08-01 11:00:00,5,150,501
|
||||||
|
6,2026-01-30 09:00:00,4,120,601
|
||||||
|
7,2025-09-15 17:10:00,2,45,701
|
||||||
|
8
sample_data/crm.csv
Normal file
8
sample_data/crm.csv
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
user_id,email,birth_date,gender,consent_flag
|
||||||
|
1,alice@gmail.com,1990-02-17,F,True
|
||||||
|
2,bob@yahoo.com,1985-07-05,M,True
|
||||||
|
3,charlie@tempmail.com,2000-11-21,M,False
|
||||||
|
4,dana@outlook.com,1992-05-30,F,True
|
||||||
|
5,eric@mailinator.com,1988-09-10,M,True
|
||||||
|
6,frank@gmail.com,1995-12-01,M,True
|
||||||
|
7,grace@no-reply.com,1993-03-15,F,True
|
||||||
|
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
BIN
src/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
src/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
0
src/automation/__init__.py
Normal file
0
src/automation/__init__.py
Normal file
0
src/pipeline/__init__.py
Normal file
0
src/pipeline/__init__.py
Normal file
BIN
src/pipeline/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/pipeline/__pycache__/config_loader.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/config_loader.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/pipeline/__pycache__/deliverability_check.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/deliverability_check.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/pipeline/__pycache__/ethics_flags.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/ethics_flags.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/pipeline/__pycache__/list_hygiene.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/list_hygiene.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/relative_date_trigger.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/pipeline/__pycache__/rf_scoring.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/rf_scoring.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/pipeline/__pycache__/unified_stream.cpython-312.pyc
Normal file
BIN
src/pipeline/__pycache__/unified_stream.cpython-312.pyc
Normal file
Binary file not shown.
9
src/pipeline/config_loader.py
Normal file
9
src/pipeline/config_loader.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Project root: three directory levels above this file
# (src/pipeline/config_loader.py -> src/pipeline -> src -> project root).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
config_path = os.path.join(BASE_DIR, "config.yaml")

# Parse the YAML once, at import time; the pipeline modules import `config`
# from here. The encoding is pinned so that parsing does not depend on the
# platform's locale default.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
|
||||||
15
src/pipeline/deliverability_check.py
Normal file
15
src/pipeline/deliverability_check.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# src/pipeline/deliverability_check.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def run_deliverability_check(*, input_file=None, output_file=None):
    """Add a basic ``deliverable`` flag to the cleaned dataset.

    Reads a CSV, adds a boolean ``deliverable`` column that is ``True`` when
    the ``email`` value contains an ``@`` character, writes the result to CSV
    and returns it.

    Args:
        input_file: CSV to read. Defaults to
            ``unified_dataset_cleaned.csv`` inside ``config["output_dir"]``.
        output_file: CSV to write. Defaults to
            ``unified_dataset_deliverable.csv`` inside ``config["output_dir"]``.

    Returns:
        The DataFrame that was written, including the ``deliverable`` column.
    """
    if input_file is None:
        input_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "unified_dataset_deliverable.csv")

    df = pd.read_csv(input_file)

    # Syntactic placeholder only: this does not perform SPF/DKIM validation.
    # regex=False treats "@" as a literal; na=False makes rows with a missing
    # e-mail come out as False instead of NaN, keeping the column boolean.
    df["deliverable"] = df["email"].str.contains("@", regex=False, na=False)

    df.to_csv(output_file, index=False)
    return df
|
||||||
27
src/pipeline/ethics_flags.py
Normal file
27
src/pipeline/ethics_flags.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# src/pipeline/ethics_flags.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def apply_ethics_checks(
    *,
    input_file=None,
    output_file=None,
    sensitive_cols=("medical_condition", "political_opinion", "dietary_restriction"),
):
    """Flag rows that carry a value in any sensitive column.

    Reads a CSV, adds a boolean ``sensitive_flag`` column, writes the result
    to CSV and returns it. A row is flagged when at least one of the
    sensitive columns that actually exist in the dataset is non-null for that
    row. When none of the sensitive columns exist, every row is flagged
    ``False``.

    Args:
        input_file: CSV to read. Defaults to
            ``unified_dataset_cleaned.csv`` inside ``config["output_dir"]``.
        output_file: CSV to write. Defaults to
            ``unified_dataset_ethics.csv`` inside ``config["output_dir"]``.
        sensitive_cols: Iterable of column names treated as sensitive.

    Returns:
        The DataFrame that was written, including ``sensitive_flag``.
    """
    if input_file is None:
        input_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "unified_dataset_ethics.csv")

    df = pd.read_csv(input_file)

    # Only inspect the sensitive columns that are present in this dataset.
    existing_cols = [col for col in sensitive_cols if col in df.columns]

    if existing_cols:
        df["sensitive_flag"] = df[existing_cols].notna().any(axis=1)
    else:
        # Nothing sensitive to inspect: still emit the column so the output
        # schema is the same either way.
        df["sensitive_flag"] = False

    df.to_csv(output_file, index=False)
    return df
|
||||||
15
src/pipeline/list_hygiene.py
Normal file
15
src/pipeline/list_hygiene.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# src/pipeline/list_hygiene.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def apply_list_hygiene(
    *,
    input_file=None,
    output_file=None,
    blocked_domains=("example.com", "test.com"),
):
    """Drop rows whose e-mail address belongs to a blocked domain.

    Reads a CSV, removes every row whose ``email`` ends in one of
    ``blocked_domains`` (the domain itself or any subdomain of it, compared
    case-insensitively), writes the remaining rows to CSV and returns them.
    Rows with a missing e-mail are kept.

    Args:
        input_file: CSV to read. Defaults to
            ``rf_scored_dataset.csv`` inside ``config["output_dir"]``.
        output_file: CSV to write. Defaults to
            ``unified_dataset_cleaned.csv`` inside ``config["output_dir"]``.
        blocked_domains: Iterable of domain names to remove. The default
            holds placeholder domains only; pass disposable-mail providers
            here to filter those as well.

    Returns:
        The filtered DataFrame that was written.
    """
    import re

    if input_file is None:
        input_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")

    df = pd.read_csv(input_file)

    escaped = [re.escape(d.strip().lower()) for d in blocked_domains if d and d.strip()]
    if escaped:
        # Anchor the match to the end of the address and require "@", "." or
        # start-of-string right before the domain. The dots are escaped, so
        # "latest.com" no longer matches "test.com" and "." no longer matches
        # an arbitrary character.
        pattern = r"(?:^|[@.])(?:" + "|".join(escaped) + r")\s*$"
        is_blocked = df["email"].str.contains(pattern, case=False, regex=True, na=False)
        df = df[~is_blocked]

    df.to_csv(output_file, index=False)
    return df
|
||||||
15
src/pipeline/relative_date_trigger.py
Normal file
15
src/pipeline/relative_date_trigger.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# src/pipeline/relative_date_trigger.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def apply_birthday_recursion(*, input_file=None, output_file=None, months_before=1):
    """Add a ``birthday_trigger`` date derived from each user's birth date.

    Reads a CSV, parses ``birth_date`` and stores
    ``birth_date - months_before months`` in a new ``birthday_trigger``
    column, writes the result to CSV and returns it.

    The offset is applied to the stored birth date itself, so the trigger
    falls in the birth year (e.g. 1992-05-30 -> 1992-04-30 for the default).
    NOTE(review): a campaign trigger presumably needs this projected onto the
    upcoming birthday's year - confirm the intended semantics with the owner.

    Args:
        input_file: CSV to read. Defaults to
            ``unified_dataset.csv`` inside ``config["output_dir"]``.
        output_file: CSV to write. Defaults to
            ``unified_dataset_with_triggers.csv`` inside ``config["output_dir"]``.
        months_before: Whole months to subtract from the birth date.
            The default of 1 matches the previously hard-coded offset.

    Returns:
        The DataFrame that was written, including ``birthday_trigger``.
    """
    if input_file is None:
        input_file = os.path.join(config["output_dir"], "unified_dataset.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "unified_dataset_with_triggers.csv")

    df = pd.read_csv(input_file)

    # DateOffset(months=...) is calendar-aware: a day that does not exist in
    # the target month is moved to that month's last day.
    birth_dates = pd.to_datetime(df["birth_date"])
    df["birthday_trigger"] = birth_dates - pd.DateOffset(months=months_before)

    df.to_csv(output_file, index=False)
    return df
|
||||||
18
src/pipeline/rf_scoring.py
Normal file
18
src/pipeline/rf_scoring.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# src/pipeline/rf_scoring.py
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
def calculate_rf_score(*, input_file=None, output_file=None, now=None):
    """Compute a simple recency-frequency score for every row.

    Reads a CSV, adds two columns, writes the result to CSV and returns it:

    * ``recency_index`` - whole days between ``now`` and the row's
      ``timestamp`` (negative when the timestamp lies after ``now``).
    * ``rf_score`` - ``1 / (recency + 1) + frequency``, where recency is
      floored at 0 for the division.

    Frequency is taken from a ``session_count`` column when the dataset has
    one. Otherwise it is the number of distinct ``session_id`` values per
    ``user_id_hashed``. If neither is available it is 0.

    Args:
        input_file: CSV to read. Defaults to
            ``unified_dataset_with_triggers.csv`` inside ``config["output_dir"]``.
        output_file: CSV to write. Defaults to
            ``rf_scored_dataset.csv`` inside ``config["output_dir"]``.
        now: Reference instant for recency. Defaults to the current UTC time;
            a naive value is interpreted as UTC.

    Returns:
        The DataFrame that was written, including both new columns.
    """
    if input_file is None:
        input_file = os.path.join(config["output_dir"], "unified_dataset_with_triggers.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv")

    if now is None:
        now = pd.Timestamp.now(tz="UTC")
    else:
        now = pd.Timestamp(now)
        now = now.tz_localize("UTC") if now.tzinfo is None else now.tz_convert("UTC")

    df = pd.read_csv(input_file)

    # utc=True keeps the subtraction valid whether or not the CSV timestamps
    # carry an explicit offset.
    event_time = pd.to_datetime(df["timestamp"], utc=True)
    df["recency_index"] = (now - event_time).dt.days

    if "session_count" in df.columns:
        frequency = df["session_count"]
    elif {"user_id_hashed", "session_id"} <= set(df.columns):
        # Without this the frequency term was always 0, because nothing in
        # the pipeline produces a session_count column.
        frequency = (
            df.groupby("user_id_hashed")["session_id"].transform("nunique").fillna(0)
        )
    else:
        frequency = 0

    # Floor recency at 0 so a timestamp slightly in the future cannot make the
    # denominator zero or negative.
    df["rf_score"] = 1 / (df["recency_index"].clip(lower=0) + 1) + frequency

    df.to_csv(output_file, index=False)
    return df
|
||||||
32
src/pipeline/unified_stream.py
Normal file
32
src/pipeline/unified_stream.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# src/pipeline/unified_stream.py
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
from .config_loader import config
|
||||||
|
|
||||||
|
# Make sure the configured output and log directories exist before any stage
# writes to them (exist_ok=True makes this a no-op on later runs).
os.makedirs(config["output_dir"], exist_ok=True)
os.makedirs(config["log_dir"], exist_ok=True)

# Route root-logger records at INFO and above to pipeline.log inside the
# configured log directory.
logging.basicConfig(
    level=logging.INFO,
    filename=os.path.join(config["log_dir"], "pipeline.log"),
)
|
||||||
|
|
||||||
|
def hash_user_id(user_id, salt=""):
    """Return the SHA-256 hex digest of a user ID's string form.

    Args:
        user_id: Identifier to hash; it is converted with ``str()`` first, so
            ``1`` and ``"1"`` produce the same digest.
        salt: Optional secret string prepended to the ID before hashing.
            With the default empty salt the digest is identical to the plain
            SHA-256 of the ID. Small sequential IDs hashed without a salt can
            be recovered by hashing every candidate, so supply a secret salt
            where the digest is meant to act as a pseudonym.

    Returns:
        A 64-character lowercase hexadecimal string.
    """
    payload = str(salt) + str(user_id)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
def load_and_merge():
    """Load the CRM and clickstream CSVs, join them and save the result.

    Both inputs get a ``user_id_hashed`` column (via ``hash_user_id``) and are
    outer-joined on it, so rows present in only one source are kept. The
    ``timestamp`` column is converted to UTC datetimes before the merged
    frame is written to ``unified_dataset.csv`` in the configured output
    directory. Because both inputs also carry a ``user_id`` column that is
    not part of the join key, the result holds it twice, as ``user_id_x``
    and ``user_id_y``.

    Returns:
        The merged DataFrame.
    """
    logging.info("Loading CRM and clickstream data...")
    crm_df = pd.read_csv(config["crm_file"])
    click_df = pd.read_csv(config["clickstream_file"])

    logging.info("Hashing user IDs for pseudonymization...")
    for frame in (crm_df, click_df):
        frame["user_id_hashed"] = frame["user_id"].apply(hash_user_id)

    logging.info("Merging datasets...")
    unified = crm_df.merge(click_df, on="user_id_hashed", how="outer")
    unified["timestamp"] = pd.to_datetime(unified["timestamp"], utc=True)

    destination = os.path.join(config["output_dir"], "unified_dataset.csv")
    unified.to_csv(destination, index=False)
    logging.info(f"Unified dataset saved to {destination}")
    return unified
|
||||||
54
wiki/gdpr-audit.md
Normal file
54
wiki/gdpr-audit.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# GDPR Adequacy Checklist
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## Governance & Breach Protocols
|
||||||
|
|
||||||
|
- DPO appointed and documented
|
||||||
|
|
||||||
|
- 72-hour breach notification process defined
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## Consent & Data Collection
|
||||||
|
|
||||||
|
- No pre-ticked marketing consent boxes
|
||||||
|
|
||||||
|
- Explicit consent required for birthday listings
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## Sensitive Data Handling
|
||||||
|
|
||||||
|
- Medical data restricted to "unwell" status
|
||||||
|
|
||||||
|
- Dietary data shared only with explicit consent
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
## International Data Transfers
|
||||||
|
|
||||||
|
- Photo/video sharing audited for adequacy mechanisms
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
## Data Protection Officer (DPO)
|
||||||
|
Name: [Your Name Here]
|
||||||
|
Role: [Your Role Here]
|
||||||
|
Contact: [email@example.com]
|
||||||
|
Date Appointed: [YYYY-MM-DD]
|
||||||
|
|
||||||
|
## Data Protection Officer (DPO)
|
||||||
|
Name:
|
||||||
|
Role:
|
||||||
|
Contact:
|
||||||
|
Date Appointed:
|
||||||
|
|
||||||
|
## Regulatory Feature Specifications
|
||||||
|
|
||||||
|
### Right of Access (Article 15)
|
||||||
|
Users can request a downloadable summary of stored personal data.
|
||||||
|
|
||||||
|
### Right of Erasure (Article 17)
|
||||||
|
Users can trigger full deletion across all systems and subprocessors.
|
||||||
Reference in New Issue
Block a user