This commit is contained in:
2026-02-25 13:14:56 +09:30
parent 3eb0d1f1e4
commit 315bafd6dc
32 changed files with 350 additions and 0 deletions

4
config.yaml Normal file
View File

@@ -0,0 +1,4 @@
crm_file: "sample_data/crm.csv"
clickstream_file: "sample_data/clickstream.csv"
output_dir: "output"
log_dir: "logs"

64
logs/pipeline.log Normal file
View File

@@ -0,0 +1,64 @@
INFO:root:=== WEEK 2 PIPELINE START ===
INFO:root:Loading CRM and clickstream data...
INFO:root:=== WEEK 2 PIPELINE START ===
INFO:root:Loading CRM and clickstream data...
INFO:root:=== WEEK 2 PIPELINE START ===
INFO:root:Loading CRM and clickstream data...
INFO:root:=== WEEK 2 PIPELINE START ===
INFO:root:Loading CRM and clickstream data...
INFO:root:=== WEEK 2 PIPELINE START ===
INFO:root:Loading CRM and clickstream data...
INFO:root:=== WEEK 2 PIPELINE START ===
INFO:root:Loading CRM and clickstream data...
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv
INFO:root:Loading CRM and clickstream data...
INFO:root:Hashing user IDs for pseudonymization...
INFO:root:Merging datasets...
INFO:root:Unified dataset saved to output\unified_dataset.csv

19
main.py Normal file
View File

@@ -0,0 +1,19 @@
# main.py
from src.pipeline.unified_stream import load_and_merge
from src.pipeline.relative_date_trigger import apply_birthday_recursion
from src.pipeline.rf_scoring import calculate_rf_score
from src.pipeline.list_hygiene import apply_list_hygiene
from src.pipeline.ethics_flags import apply_ethics_checks
from src.pipeline.deliverability_check import run_deliverability_check
def main():
    """Run every Week 2 pipeline stage in order.

    The stages communicate through CSV files in the configured output
    directory, so the call order below is significant: each stage reads a
    file written by an earlier one.
    """
    print("=== WEEK 2 PIPELINE START ===")
    load_and_merge()
    apply_birthday_recursion()
    calculate_rf_score()
    apply_list_hygiene()
    # NOTE(review): the two stages below both read the cleaned dataset, so
    # the ethics output has no "deliverable" column and the deliverability
    # output has no "sensitive_flag" column. Confirm that is intended.
    apply_ethics_checks()
    run_deliverability_check()
    print("=== WEEK 2 PIPELINE COMPLETE ===")


# Guard the entry point so importing this module (e.g. from a test or a
# scheduler) does not execute the whole pipeline as a side effect.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,10 @@
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.034482758620689655
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.02040816326530612
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.02564102564102564
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.045454545454545456
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.006211180124223602
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.015384615384615385
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.07142857142857142
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388345
1 user_id_x email birth_date gender consent_flag user_id_hashed user_id_y timestamp page_depth dwell_time session_id birthday_trigger recency_index rf_score
2 4 dana@outlook.com 1992-05-30 F True 4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a 4 2026-01-25 16:45:00+00:00 3 60 401 1992-04-30 28 0.034482758620689655
3 3 charlie@tempmail.com 2000-11-21 M False 4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce 3 2026-01-05 12:30:00+00:00 2 30 301 2000-10-21 48 0.02040816326530612
4 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-01-15 09:15:00+00:00 5 120 101 1990-01-17 38 0.02564102564102564
5 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-02-01 10:20:00+00:00 3 90 102 1990-01-17 21 0.045454545454545456
6 7 grace@no-reply.com 1993-03-15 F True 7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451 7 2025-09-15 17:10:00+00:00 2 45 701 1993-02-15 160 0.006211180124223602
7 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2025-12-20 14:10:00+00:00 4 60 201 1985-06-05 64 0.015384615384615385
8 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2026-02-10 08:50:00+00:00 6 180 202 1985-06-05 13 0.07142857142857142
9 6 frank@gmail.com 1995-12-01 M True e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683 6 2026-01-30 09:00:00+00:00 4 120 601 1995-11-01 24 0.04
10 5 eric@mailinator.com 1988-09-10 M True ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d 5 2025-08-01 11:00:00+00:00 5 150 501 1988-08-10 205 0.0048543689320388345

View File

@@ -0,0 +1,10 @@
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501
1 user_id_x email birth_date gender consent_flag user_id_hashed user_id_y timestamp page_depth dwell_time session_id
2 4 dana@outlook.com 1992-05-30 F True 4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a 4 2026-01-25 16:45:00+00:00 3 60 401
3 3 charlie@tempmail.com 2000-11-21 M False 4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce 3 2026-01-05 12:30:00+00:00 2 30 301
4 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-01-15 09:15:00+00:00 5 120 101
5 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-02-01 10:20:00+00:00 3 90 102
6 7 grace@no-reply.com 1993-03-15 F True 7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451 7 2025-09-15 17:10:00+00:00 2 45 701
7 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2025-12-20 14:10:00+00:00 4 60 201
8 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2026-02-10 08:50:00+00:00 6 180 202
9 6 frank@gmail.com 1995-12-01 M True e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683 6 2026-01-30 09:00:00+00:00 4 120 601
10 5 eric@mailinator.com 1988-09-10 M True ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d 5 2025-08-01 11:00:00+00:00 5 150 501

View File

@@ -0,0 +1,10 @@
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388
1 user_id_x email birth_date gender consent_flag user_id_hashed user_id_y timestamp page_depth dwell_time session_id birthday_trigger recency_index rf_score
2 4 dana@outlook.com 1992-05-30 F True 4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a 4 2026-01-25 16:45:00+00:00 3 60 401 1992-04-30 28 0.0344827586206896
3 3 charlie@tempmail.com 2000-11-21 M False 4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce 3 2026-01-05 12:30:00+00:00 2 30 301 2000-10-21 48 0.0204081632653061
4 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-01-15 09:15:00+00:00 5 120 101 1990-01-17 38 0.0256410256410256
5 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-02-01 10:20:00+00:00 3 90 102 1990-01-17 21 0.0454545454545454
6 7 grace@no-reply.com 1993-03-15 F True 7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451 7 2025-09-15 17:10:00+00:00 2 45 701 1993-02-15 160 0.0062111801242236
7 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2025-12-20 14:10:00+00:00 4 60 201 1985-06-05 64 0.0153846153846153
8 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2026-02-10 08:50:00+00:00 6 180 202 1985-06-05 13 0.0714285714285714
9 6 frank@gmail.com 1995-12-01 M True e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683 6 2026-01-30 09:00:00+00:00 4 120 601 1995-11-01 24 0.04
10 5 eric@mailinator.com 1988-09-10 M True ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d 5 2025-08-01 11:00:00+00:00 5 150 501 1988-08-10 205 0.0048543689320388

View File

@@ -0,0 +1,10 @@
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,deliverable
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,True
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,True
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,True
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,True
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,True
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,True
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,True
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,True
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,True
1 user_id_x email birth_date gender consent_flag user_id_hashed user_id_y timestamp page_depth dwell_time session_id birthday_trigger recency_index rf_score deliverable
2 4 dana@outlook.com 1992-05-30 F True 4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a 4 2026-01-25 16:45:00+00:00 3 60 401 1992-04-30 28 0.0344827586206896 True
3 3 charlie@tempmail.com 2000-11-21 M False 4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce 3 2026-01-05 12:30:00+00:00 2 30 301 2000-10-21 48 0.0204081632653061 True
4 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-01-15 09:15:00+00:00 5 120 101 1990-01-17 38 0.0256410256410256 True
5 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-02-01 10:20:00+00:00 3 90 102 1990-01-17 21 0.0454545454545454 True
6 7 grace@no-reply.com 1993-03-15 F True 7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451 7 2025-09-15 17:10:00+00:00 2 45 701 1993-02-15 160 0.0062111801242236 True
7 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2025-12-20 14:10:00+00:00 4 60 201 1985-06-05 64 0.0153846153846153 True
8 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2026-02-10 08:50:00+00:00 6 180 202 1985-06-05 13 0.0714285714285714 True
9 6 frank@gmail.com 1995-12-01 M True e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683 6 2026-01-30 09:00:00+00:00 4 120 601 1995-11-01 24 0.04 True
10 5 eric@mailinator.com 1988-09-10 M True ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d 5 2025-08-01 11:00:00+00:00 5 150 501 1988-08-10 205 0.0048543689320388 True

View File

@@ -0,0 +1,10 @@
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,sensitive_flag
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,False
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,False
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,False
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,False
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,False
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,False
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,False
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,False
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,False
1 user_id_x email birth_date gender consent_flag user_id_hashed user_id_y timestamp page_depth dwell_time session_id birthday_trigger recency_index rf_score sensitive_flag
2 4 dana@outlook.com 1992-05-30 F True 4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a 4 2026-01-25 16:45:00+00:00 3 60 401 1992-04-30 28 0.0344827586206896 False
3 3 charlie@tempmail.com 2000-11-21 M False 4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce 3 2026-01-05 12:30:00+00:00 2 30 301 2000-10-21 48 0.0204081632653061 False
4 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-01-15 09:15:00+00:00 5 120 101 1990-01-17 38 0.0256410256410256 False
5 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-02-01 10:20:00+00:00 3 90 102 1990-01-17 21 0.0454545454545454 False
6 7 grace@no-reply.com 1993-03-15 F True 7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451 7 2025-09-15 17:10:00+00:00 2 45 701 1993-02-15 160 0.0062111801242236 False
7 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2025-12-20 14:10:00+00:00 4 60 201 1985-06-05 64 0.0153846153846153 False
8 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2026-02-10 08:50:00+00:00 6 180 202 1985-06-05 13 0.0714285714285714 False
9 6 frank@gmail.com 1995-12-01 M True e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683 6 2026-01-30 09:00:00+00:00 4 120 601 1995-11-01 24 0.04 False
10 5 eric@mailinator.com 1988-09-10 M True ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d 5 2025-08-01 11:00:00+00:00 5 150 501 1988-08-10 205 0.0048543689320388 False

View File

@@ -0,0 +1,10 @@
user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger
4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30
3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17
1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17
7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05
2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05
6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01
5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10
1 user_id_x email birth_date gender consent_flag user_id_hashed user_id_y timestamp page_depth dwell_time session_id birthday_trigger
2 4 dana@outlook.com 1992-05-30 F True 4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a 4 2026-01-25 16:45:00+00:00 3 60 401 1992-04-30
3 3 charlie@tempmail.com 2000-11-21 M False 4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce 3 2026-01-05 12:30:00+00:00 2 30 301 2000-10-21
4 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-01-15 09:15:00+00:00 5 120 101 1990-01-17
5 1 alice@gmail.com 1990-02-17 F True 6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b 1 2026-02-01 10:20:00+00:00 3 90 102 1990-01-17
6 7 grace@no-reply.com 1993-03-15 F True 7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451 7 2025-09-15 17:10:00+00:00 2 45 701 1993-02-15
7 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2025-12-20 14:10:00+00:00 4 60 201 1985-06-05
8 2 bob@yahoo.com 1985-07-05 M True d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35 2 2026-02-10 08:50:00+00:00 6 180 202 1985-06-05
9 6 frank@gmail.com 1995-12-01 M True e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683 6 2026-01-30 09:00:00+00:00 4 120 601 1995-11-01
10 5 eric@mailinator.com 1988-09-10 M True ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d 5 2025-08-01 11:00:00+00:00 5 150 501 1988-08-10

0
requirement.txt Normal file
View File

View File

@@ -0,0 +1,10 @@
user_id,timestamp,page_depth,dwell_time,session_id
1,2026-01-15 09:15:00,5,120,101
1,2026-02-01 10:20:00,3,90,102
2,2025-12-20 14:10:00,4,60,201
2,2026-02-10 08:50:00,6,180,202
3,2026-01-05 12:30:00,2,30,301
4,2026-01-25 16:45:00,3,60,401
5,2025-08-01 11:00:00,5,150,501
6,2026-01-30 09:00:00,4,120,601
7,2025-09-15 17:10:00,2,45,701
1 user_id timestamp page_depth dwell_time session_id
2 1 2026-01-15 09:15:00 5 120 101
3 1 2026-02-01 10:20:00 3 90 102
4 2 2025-12-20 14:10:00 4 60 201
5 2 2026-02-10 08:50:00 6 180 202
6 3 2026-01-05 12:30:00 2 30 301
7 4 2026-01-25 16:45:00 3 60 401
8 5 2025-08-01 11:00:00 5 150 501
9 6 2026-01-30 09:00:00 4 120 601
10 7 2025-09-15 17:10:00 2 45 701

8
sample_data/crm.csv Normal file
View File

@@ -0,0 +1,8 @@
user_id,email,birth_date,gender,consent_flag
1,alice@gmail.com,1990-02-17,F,True
2,bob@yahoo.com,1985-07-05,M,True
3,charlie@tempmail.com,2000-11-21,M,False
4,dana@outlook.com,1992-05-30,F,True
5,eric@mailinator.com,1988-09-10,M,True
6,frank@gmail.com,1995-12-01,M,True
7,grace@no-reply.com,1993-03-15,F,True
1 user_id email birth_date gender consent_flag
2 1 alice@gmail.com 1990-02-17 F True
3 2 bob@yahoo.com 1985-07-05 M True
4 3 charlie@tempmail.com 2000-11-21 M False
5 4 dana@outlook.com 1992-05-30 F True
6 5 eric@mailinator.com 1988-09-10 M True
7 6 frank@gmail.com 1995-12-01 M True
8 7 grace@no-reply.com 1993-03-15 F True

0
src/__init__.py Normal file
View File

Binary file not shown.

View File

0
src/pipeline/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,9 @@
import os
import yaml
# Project root: the directory two levels above the one containing this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
config_path = os.path.join(BASE_DIR, "config.yaml")

# Read with an explicit encoding so parsing does not depend on the platform's
# default locale encoding.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# safe_load returns None for an empty document (and a list/scalar for other
# shapes). Fail here with a clear message instead of a later, unrelated
# TypeError on the first config["..."] lookup.
if not isinstance(config, dict):
    raise ValueError(
        f"{config_path} must contain a YAML mapping, got {type(config).__name__}"
    )

View File

@@ -0,0 +1,15 @@
# src/pipeline/deliverability_check.py
import pandas as pd
import os
from .config_loader import config
def run_deliverability_check(input_file=None, output_file=None):
    """Add a boolean ``deliverable`` column to the cleaned dataset.

    A row is marked deliverable when its ``email`` value contains an "@".
    This is only a syntactic placeholder; no SPF/DKIM or mailbox lookup is
    performed.

    Args:
        input_file: CSV to read. Defaults to ``unified_dataset_cleaned.csv``
            in the configured output directory.
        output_file: CSV to write. Defaults to
            ``unified_dataset_deliverable.csv`` in the configured output
            directory.

    Returns:
        The DataFrame that was written, including the ``deliverable`` column.
    """
    if input_file is None:
        input_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "unified_dataset_deliverable.csv")

    df = pd.read_csv(input_file)

    # Missing emails are read as NaN; normalise them to "" so the flag is a
    # real False rather than NaN. regex=False makes "@" a literal match.
    emails = df["email"].fillna("").astype(str)
    df["deliverable"] = emails.str.contains("@", regex=False)

    df.to_csv(output_file, index=False)
    return df

View File

@@ -0,0 +1,27 @@
# src/pipeline/ethics_flags.py
import pandas as pd
import os
from .config_loader import config
def apply_ethics_checks():
    """Add a ``sensitive_flag`` column to the cleaned dataset and save it.

    Reads ``unified_dataset_cleaned.csv`` from the configured output
    directory, flags rows holding a value in any known sensitive column, and
    writes the result to ``unified_dataset_ethics.csv``.

    Returns:
        The DataFrame that was written.
    """
    out_dir = config["output_dir"]
    frame = pd.read_csv(os.path.join(out_dir, "unified_dataset_cleaned.csv"))

    # Columns treated as sensitive; only those present in the data are used.
    candidates = ("medical_condition", "political_opinion", "dietary_restriction")
    present = [name for name in candidates if name in frame.columns]

    # A row is flagged when at least one present sensitive column is non-null.
    # With no sensitive columns at all, every row gets False.
    frame["sensitive_flag"] = frame[present].notna().any(axis=1) if present else False

    frame.to_csv(os.path.join(out_dir, "unified_dataset_ethics.csv"), index=False)
    return frame

View File

@@ -0,0 +1,15 @@
# src/pipeline/list_hygiene.py
import pandas as pd
import os
from .config_loader import config
def apply_list_hygiene(blocked_domains=("example.com", "test.com"), input_file=None, output_file=None):
    """Drop rows whose email belongs to a blocked domain and save the result.

    An email is blocked when its domain equals a blocked domain or is a
    subdomain of one. Matching is case-insensitive and anchored to the end of
    the address, so ``joe@contest.com`` is NOT treated as ``test.com``.
    Rows with a missing email are kept.

    Args:
        blocked_domains: Domains to remove. The default keeps the original
            placeholder list.
            NOTE(review): disposable-mail providers are not in the default
            list; pass them explicitly if they should be removed.
        input_file: CSV to read. Defaults to ``rf_scored_dataset.csv`` in the
            configured output directory.
        output_file: CSV to write. Defaults to
            ``unified_dataset_cleaned.csv`` in the configured output
            directory.

    Returns:
        The filtered DataFrame that was written.
    """
    import re  # local import: only this function needs it

    if input_file is None:
        input_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "unified_dataset_cleaned.csv")

    df = pd.read_csv(input_file)

    # Accept a single domain passed as a plain string.
    if isinstance(blocked_domains, str):
        blocked_domains = (blocked_domains,)
    escaped = [re.escape(d.strip().lower()) for d in blocked_domains if d and d.strip()]

    if escaped:
        # "[@.]" requires the blocked name to start right after the "@" or a
        # subdomain dot; "$" requires it to end the address. Dots are escaped
        # so they match literally.
        pattern = r"[@.](?:" + "|".join(escaped) + r")$"
        emails = df["email"].fillna("").astype(str).str.strip().str.lower()
        df = df[~emails.str.contains(pattern, regex=True)]

    df.to_csv(output_file, index=False)
    return df

View File

@@ -0,0 +1,15 @@
# src/pipeline/relative_date_trigger.py
import pandas as pd
import os
from .config_loader import config
def apply_birthday_recursion(months_before=1, input_file=None, output_file=None):
    """Add a ``birthday_trigger`` date offset from each ``birth_date``.

    The trigger is ``birth_date`` minus ``months_before`` calendar months.
    It is computed from the birth date itself, so it falls in (or just
    before) the birth year rather than in an upcoming year.
    NOTE(review): the previous comment described an "11-month" trigger while
    the code subtracted one month; the default keeps the one-month behaviour.
    Confirm which offset is wanted.

    Args:
        months_before: Number of calendar months to subtract from
            ``birth_date``.
        input_file: CSV to read. Defaults to ``unified_dataset.csv`` in the
            configured output directory.
        output_file: CSV to write. Defaults to
            ``unified_dataset_with_triggers.csv`` in the configured output
            directory.

    Returns:
        The DataFrame that was written, including ``birthday_trigger``.
    """
    if input_file is None:
        input_file = os.path.join(config["output_dir"], "unified_dataset.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "unified_dataset_with_triggers.csv")

    df = pd.read_csv(input_file)

    # DateOffset(months=...) does calendar arithmetic and clips to the last
    # valid day of the target month (e.g. 31 March -> end of February).
    birth_dates = pd.to_datetime(df["birth_date"])
    df["birthday_trigger"] = birth_dates - pd.DateOffset(months=months_before)

    df.to_csv(output_file, index=False)
    return df

View File

@@ -0,0 +1,18 @@
# src/pipeline/rf_scoring.py
import pandas as pd
import os
from .config_loader import config
def calculate_rf_score(now=None, input_file=None, output_file=None):
    """Add ``recency_index`` and ``rf_score`` columns and save the result.

    ``recency_index`` is the whole number of days between ``now`` and each
    row's ``timestamp``, floored at 0. ``rf_score`` is
    ``1 / (recency_index + 1)`` plus the ``session_count`` column when the
    input has one.
    NOTE(review): when the input has no ``session_count`` column the
    frequency term is 0 and the score is recency-only.

    Args:
        now: Reference time for recency. Defaults to the current UTC time.
            A timezone-naive value is interpreted as UTC. Pass a fixed value
            for reproducible scores.
        input_file: CSV to read. Defaults to
            ``unified_dataset_with_triggers.csv`` in the configured output
            directory.
        output_file: CSV to write. Defaults to ``rf_scored_dataset.csv`` in
            the configured output directory.

    Returns:
        The DataFrame that was written, including both new columns.
    """
    if input_file is None:
        input_file = os.path.join(config["output_dir"], "unified_dataset_with_triggers.csv")
    if output_file is None:
        output_file = os.path.join(config["output_dir"], "rf_scored_dataset.csv")

    df = pd.read_csv(input_file)

    if now is None:
        now = pd.Timestamp.now(tz="UTC")
    else:
        now = pd.Timestamp(now)
        now = now.tz_localize("UTC") if now.tzinfo is None else now.tz_convert("UTC")

    # utc=True makes the column timezone-aware even when the CSV holds naive
    # timestamps; subtracting naive values from an aware "now" would raise.
    timestamps = pd.to_datetime(df["timestamp"], utc=True)

    # A timestamp slightly in the future yields -1 days, which would make the
    # denominator below zero; floor recency at 0 instead.
    df["recency_index"] = (now - timestamps).dt.days.clip(lower=0)
    df["rf_score"] = 1 / (df["recency_index"] + 1) + df.get("session_count", 0)

    df.to_csv(output_file, index=False)
    return df

View File

@@ -0,0 +1,32 @@
# src/pipeline/unified_stream.py
import os
import pandas as pd
import hashlib
import logging
from .config_loader import config
# Import-time setup: make sure the configured output and log directories
# exist, then point the root logger at <log_dir>/pipeline.log at INFO level.
for _directory in (config["output_dir"], config["log_dir"]):
    os.makedirs(_directory, exist_ok=True)

logging.basicConfig(
    filename=os.path.join(config["log_dir"], "pipeline.log"),
    level=logging.INFO,
)
def hash_user_id(user_id, *, salt=""):
    """Return the SHA-256 hex digest of a user ID.

    Whole-number floats are normalised to integers first, so ``1`` and
    ``1.0`` produce the same digest. This matters when a CSV column has been
    promoted to float (for example because it contains a missing value) and
    the digest is used as a join key against an integer column.

    Args:
        user_id: The identifier to hash; it is converted with ``str()``.
        salt: Optional text prepended to the ID before hashing. The default
            empty salt gives the plain, unsalted digest.

    Returns:
        A 64-character lowercase hexadecimal string.
    """
    if isinstance(user_id, float) and user_id.is_integer():
        user_id = int(user_id)
    return hashlib.sha256(f"{salt}{user_id}".encode("utf-8")).hexdigest()
def load_and_merge():
    """Load the CRM and clickstream CSVs, join them, and save the result.

    Both inputs get a ``user_id_hashed`` column and are outer-joined on it.
    The ``timestamp`` column is then converted to UTC datetimes and the
    merged frame is written to ``unified_dataset.csv`` in the configured
    output directory.

    Returns:
        The merged DataFrame that was written.
    """
    logging.info("Loading CRM and clickstream data...")
    crm_frame = pd.read_csv(config["crm_file"])
    click_frame = pd.read_csv(config["clickstream_file"])

    logging.info("Hashing user IDs for pseudonymization...")
    for frame in (crm_frame, click_frame):
        frame["user_id_hashed"] = frame["user_id"].apply(hash_user_id)

    logging.info("Merging datasets...")
    # Outer join: rows present in only one of the two sources are kept.
    unified = crm_frame.merge(click_frame, on="user_id_hashed", how="outer")
    unified["timestamp"] = pd.to_datetime(unified["timestamp"], utc=True)

    destination = os.path.join(config["output_dir"], "unified_dataset.csv")
    unified.to_csv(destination, index=False)
    logging.info(f"Unified dataset saved to {destination}")
    return unified

54
wiki/gdpr-audit.md Normal file
View File

@@ -0,0 +1,54 @@
# GDPR Adequacy Checklist

## Governance & Breach Protocols
- DPO appointed and documented
- 72-hour breach notification process defined

## Consent & Data Collection
- No pre-ticked marketing consent boxes
- Explicit consent required for birthday listings

## Sensitive Data Handling
- Medical data restricted to "unwell" status
- Dietary data shared only with explicit consent

## International Data Transfers
- Photo/video sharing audited for adequacy mechanisms

#
## Data Protection Officer (DPO)
Name: [Your Name Here]
Role: [Your Role Here]
Contact: [email@example.com]
Date Appointed: [YYYY-MM-DD]
## Data Protection Officer (DPO)
Name:
Role:
Contact:
Date Appointed:
## Regulatory Feature Specifications
### Right of Access (Article 15)
Users can request a downloadable summary of stored personal data.
### Right of Erasure (Article 17)
Users can trigger full deletion across all systems and subprocessors.