diff --git a/README.md b/README.md index 621db59..f1a80ac 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,362 @@ # birthdaymessaging + +Overview + +The Birthday Messaging Intelligence Platform is a data-driven engagement system designed to transform static birthday campaigns into an intelligent engagement engine. + +The system integrates CRM data, behavioral interaction logs, and predictive analytics to automatically identify high-value engagement opportunities, detect churn risk, and orchestrate personalized birthday messaging campaigns. + +The platform evolves through three stages: + +Stage Description +Week 2 Data pipeline and engagement preparation +Week 3 Predictive intelligence and behavioral segmentation +Future Multi-channel automation and campaign optimization + +The end result is a system that predicts disengagement before it happens and activates personalized engagement campaigns. + +System Architecture + +The project is organized into modular components following a data pipeline → intelligence → automation architecture. 
+ +CRM Data + Clickstream Logs + │ + ▼ + Unified Data Pipeline + │ + ▼ + Data Cleaning & Compliance + │ + ▼ + Engagement Intelligence + │ + ▼ + Churn Prediction + │ + ▼ + Predictive Segmentation + │ + ▼ + Automation Triggers +Project Structure +birthdaymessaging/ +│ +├── config.yaml +├── main.py +├── README.md +│ +├── data/ +│ ├── raw/ +│ └── processed/ +│ +├── docs/ +│ ├── ethics.md +│ └── model-audits/ +│ +├── src/ +│ ├── pipeline/ # Week 2 Data Engineering +│ │ ├── unified_stream.py +│ │ ├── relative_date_trigger.py +│ │ ├── rf_scoring.py +│ │ ├── list_hygiene.py +│ │ ├── ethics_flags.py +│ │ ├── deliverability_check.py +│ │ └── config_loader.py +│ │ +│ ├── models/ # Week 3 Intelligence Layer +│ │ ├── engagement_scoring.py +│ │ ├── churn_classifier.py +│ │ └── send_time_optimizer.py +│ │ +│ ├── segmentation/ +│ │ └── predictive_segments.py +│ │ +│ └── automation/ +│ ├── trigger_engine.py +│ └── reactivation_triggers.yaml +Week 2: Data Engineering Pipeline + +Week 2 focuses on building a clean, compliant data foundation. + +Unified Data Stream + +File: + +src/pipeline/unified_stream.py + +Functions: + +Load CRM dataset + +Load clickstream activity logs + +Pseudonymize user identifiers + +Merge datasets + +Normalize timestamps (UTC) + +Output: + +unified_dataset.csv +Birthday Recursion Trigger + +File: + +relative_date_trigger.py + +Implements recurring campaign triggers such as: + +Birthday approaching + +Pre-birthday reminder + +Post-birthday follow-up + +Ensures contacts re-enter the workflow annually. + +Recency-Frequency Scoring + +File: + +rf_scoring.py + +Calculates engagement indicators: + +Recency index + +Frequency of sessions + +Session change delta + +Used as an early engagement signal. 
+ +List Hygiene Automation + +File: + +list_hygiene.py + +Improves deliverability by removing: + +Invalid email domains + +Disposable email services + +Role-based addresses + +Ethics and Compliance Layer + +File: + +ethics_flags.py + +Detects sensitive data usage: + +medical_condition + +political_opinion + +dietary_restrictions + +Ensures compliance with GDPR principles. + +Deliverability Simulation + +File: + +deliverability_check.py + +Simulates email infrastructure readiness: + +SPF validation + +DKIM compatibility + +DMARC enforcement + +inactive contact suppression + +Week 3: Engagement Intelligence Engine + +Week 3 converts the pipeline into a predictive analytics platform. + +Engagement Scoring Engine + +File: + +src/models/engagement_scoring.py + +Calculates a normalized engagement score (0–100) based on: + +Recency of interaction + +Session frequency + +Session duration + +Click engagement depth + +Formula: + +Engagement Score = +(Recency Weight × Normalized Recency) ++ (Frequency Weight × Session Count) ++ (Engagement Depth Weight × Interaction Index) + +Output classification: + +Score Tier +75–100 Highly Engaged +50–74 Stable +25–49 Declining +0–24 At Risk +Churn Risk Classification + +File: + +src/models/churn_classifier.py + +Defines churn risk using behavioral signals. + +Conditions: + +No login ≥ X days +AND +Engagement score drop ≥ Y% + +Risk categories: + +Risk Level Description +Low Healthy engagement +Medium Engagement declining +High High probability of churn + +Output fields: + +churn_risk_flag +churn_probability_score +risk_class +Predictive Segmentation + +File: + +src/segmentation/predictive_segments.py + +Creates behavioral marketing segments using: + +Lifecycle Stage +Engagement Tier +Birthday Proximity +Churn Risk Overlay + +Example segment: + +At-Risk + Birthday Approaching +Send-Time Optimization + +File: + +src/models/send_time_optimizer.py + +Analyzes historical open patterns and determines optimal send windows. 
+ +Outputs: + +optimal_send_hour +optimal_send_day +Automation Triggers + +File: + +src/automation/reactivation_triggers.yaml + +Defines campaign orchestration rules. + +Example: + +IF churn_risk_flag = True +AND birthday_within_30_days = True + +THEN send reminder email +WAIT 24h +IF unopened → send SMS +Model Governance & Ethics + +Documentation: + +docs/model-audits/week3-bias-review.md +docs/ethics.md + +Audits include: + +Geographic bias + +Age-based performance variance + +False-positive churn classification + +Transparency measures include clear documentation of automated decision logic. + +Running the Pipeline + +Run the full pipeline: + +python main.py + +Pipeline order: + +Week 2: +Data Merge +Birthday Triggers +RF Scoring +List Hygiene +Ethics Checks +Deliverability Simulation + +Week 3: +Engagement Scoring +Churn Classification +Predictive Segmentation +Send-Time Optimization + +Final dataset: + +data/processed/final_intelligence_output.csv +Success Criteria + +The system is considered operational when: + +Engagement scores generated for all active users + +Churn risk classification validated + +Predictive segments documented + +Reactivation triggers tested + +Send-time optimization operational + +Ethics documentation updated + +Bias audit completed + +Outcome + +At completion, the platform functions as an Engagement Intelligence Engine. + +Capabilities include: + +Predicting disengagement + +Prioritizing high-value users + +Automating reactivation campaigns + +Personalizing birthday marketing + +Ensuring ethical data usage + +The platform transforms traditional birthday messaging into a behaviorally intelligent engagement strategy. 
+ diff --git a/config.yaml b/config.yaml index 680c808..e83f070 100644 --- a/config.yaml +++ b/config.yaml @@ -2,3 +2,12 @@ crm_file: "sample_data/crm.csv" clickstream_file: "sample_data/clickstream.csv" output_dir: "output" log_dir: "logs" +reactivation_trigger: + conditions: + churn_risk_flag: true + birthday_within_30_days: true + actions: + - send_email: emotional_reminder + - wait: 24h + - if_unopened: + send_sms: follow_up \ No newline at end of file diff --git a/logs/pipeline.log b/logs/pipeline.log index 1cb4cb9..7aa3eb8 100644 --- a/logs/pipeline.log +++ b/logs/pipeline.log @@ -62,3 +62,59 @@ INFO:root:Loading CRM and clickstream data... INFO:root:Hashing user IDs for pseudonymization... INFO:root:Merging datasets... INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... 
+INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... +INFO:root:Unified dataset saved to output\unified_dataset.csv +INFO:root:Loading CRM and clickstream data... +INFO:root:Hashing user IDs for pseudonymization... +INFO:root:Merging datasets... 
+INFO:root:Unified dataset saved to output\unified_dataset.csv diff --git a/main.py b/main.py index c4f3696..6eab459 100644 --- a/main.py +++ b/main.py @@ -6,9 +6,13 @@ from src.pipeline.list_hygiene import apply_list_hygiene from src.pipeline.ethics_flags import apply_ethics_checks from src.pipeline.deliverability_check import run_deliverability_check -print("=== WEEK 2 PIPELINE START ===") +from src.models.engagement_scoring import calculate_engagement_scores +from src.models.churn_classifier import classify_churn +from src.models.send_time_optimizer import optimize_send_time +from src.models.predictive_segments import build_predictive_segments -load_and_merge() +print("=== WEEK 2 PIPELINE START ===") +df= load_and_merge() apply_birthday_recursion() calculate_rf_score() apply_list_hygiene() @@ -17,3 +21,12 @@ apply_ethics_checks() run_deliverability_check() print("=== WEEK 2 PIPELINE COMPLETE ===") + +print("=== WEEK 3 START ===") + +df = calculate_engagement_scores(df) +df = classify_churn(df) +df = optimize_send_time(df) +df=build_predictive_segments(df) + +print("=== WEEK 3 COMPLETE ===") diff --git a/output/rf_scored_dataset.csv b/output/rf_scored_dataset.csv index 154cba3..c403522 100644 --- a/output/rf_scored_dataset.csv +++ b/output/rf_scored_dataset.csv @@ -1,10 +1,10 @@ user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score -4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.034482758620689655 -3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.02040816326530612 -1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.02564102564102564 
-1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.045454545454545456 -7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.006211180124223602 -2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.015384615384615385 -2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.07142857142857142 -6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04 -5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388345 +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,42,0.023255813953488372 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,62,0.015873015873015872 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,52,0.018867924528301886 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,35,0.027777777777777776 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,174,0.005714285714285714 
+2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,78,0.012658227848101266 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,26,0.037037037037037035 +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,37,0.02631578947368421 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,219,0.004545454545454545 diff --git a/output/unified_dataset_cleaned.csv b/output/unified_dataset_cleaned.csv index 42d281b..e8d89dd 100644 --- a/output/unified_dataset_cleaned.csv +++ b/output/unified_dataset_cleaned.csv @@ -1,10 +1,10 @@ user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score -4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896 -3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061 -1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256 -1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454 -7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236 
-2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153 -2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714 -6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04 -5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388 +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,42,0.0232558139534883 +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,62,0.0158730158730158 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,52,0.0188679245283018 +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,35,0.0277777777777777 +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,174,0.0057142857142857 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,78,0.0126582278481012 +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,26,0.037037037037037 
+6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,37,0.0263157894736842 +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,219,0.0045454545454545 diff --git a/output/unified_dataset_deliverable.csv b/output/unified_dataset_deliverable.csv index 90771ec..c6884aa 100644 --- a/output/unified_dataset_deliverable.csv +++ b/output/unified_dataset_deliverable.csv @@ -1,10 +1,10 @@ user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,deliverable -4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,True -3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,True -1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,True -1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,True -7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,True -2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,True -2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,True 
-6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,True -5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,True +4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,42,0.0232558139534883,True +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,62,0.0158730158730158,True +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,52,0.0188679245283018,True +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,35,0.0277777777777777,True +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,174,0.0057142857142857,True +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,78,0.0126582278481012,True +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,26,0.037037037037037,True +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,37,0.0263157894736842,True +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,219,0.0045454545454545,True diff --git 
a/output/unified_dataset_ethics.csv b/output/unified_dataset_ethics.csv index abbfeac..a9e0d55 100644 --- a/output/unified_dataset_ethics.csv +++ b/output/unified_dataset_ethics.csv @@ -1,10 +1,10 @@ user_id_x,email,birth_date,gender,consent_flag,user_id_hashed,user_id_y,timestamp,page_depth,dwell_time,session_id,birthday_trigger,recency_index,rf_score,sensitive_flag -4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,28,0.0344827586206896,False -3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,48,0.0204081632653061,False -1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,38,0.0256410256410256,False -1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,21,0.0454545454545454,False -7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,160,0.0062111801242236,False -2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,64,0.0153846153846153,False -2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,13,0.0714285714285714,False -6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,24,0.04,False -5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,205,0.0048543689320388,False 
+4,dana@outlook.com,1992-05-30,F,True,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a,4,2026-01-25 16:45:00+00:00,3,60,401,1992-04-30,42,0.0232558139534883,False +3,charlie@tempmail.com,2000-11-21,M,False,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce,3,2026-01-05 12:30:00+00:00,2,30,301,2000-10-21,62,0.0158730158730158,False +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-01-15 09:15:00+00:00,5,120,101,1990-01-17,52,0.0188679245283018,False +1,alice@gmail.com,1990-02-17,F,True,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b,1,2026-02-01 10:20:00+00:00,3,90,102,1990-01-17,35,0.0277777777777777,False +7,grace@no-reply.com,1993-03-15,F,True,7902699be42c8a8e46fbbb4501726517e86b22c56a189f7625a6da49081b2451,7,2025-09-15 17:10:00+00:00,2,45,701,1993-02-15,174,0.0057142857142857,False +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2025-12-20 14:10:00+00:00,4,60,201,1985-06-05,78,0.0126582278481012,False +2,bob@yahoo.com,1985-07-05,M,True,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35,2,2026-02-10 08:50:00+00:00,6,180,202,1985-06-05,26,0.037037037037037,False +6,frank@gmail.com,1995-12-01,M,True,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a5ffcfb81e6f0919683,6,2026-01-30 09:00:00+00:00,4,120,601,1995-11-01,37,0.0263157894736842,False +5,eric@mailinator.com,1988-09-10,M,True,ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d,5,2025-08-01 11:00:00+00:00,5,150,501,1988-08-10,219,0.0045454545454545,False diff --git a/src/models/__pycache__/churn_classifier.cpython-312.pyc b/src/models/__pycache__/churn_classifier.cpython-312.pyc new file mode 100644 index 0000000..9eed1d1 Binary files /dev/null and b/src/models/__pycache__/churn_classifier.cpython-312.pyc differ diff --git a/src/models/__pycache__/engagement_scoring.cpython-312.pyc 
import numpy as np
import pandas as pd


# --------------------------------------------------------------------------
# src/models/churn_classifier.py
# --------------------------------------------------------------------------

def _score_change_per_user(df):
    """Return each row's engagement-score change versus the same user's previous session.

    Rows are ordered chronologically (when a ``timestamp`` column exists) and
    differenced within each ``user_id_hashed`` group (when that column
    exists), so a user's first session is never compared with another user's
    score. The result is a NumPy array in the frame's original row order, with
    0 for rows that have no earlier session.
    """
    available = [
        name
        for name in ("user_id_hashed", "timestamp", "engagement_score")
        if name in df.columns
    ]
    # Work on a positional copy so the result can be written back by position,
    # independent of whatever index labels the caller's frame carries.
    work = df[available].reset_index(drop=True)
    if "timestamp" in work.columns:
        work = work.sort_values("timestamp", kind="stable")

    scores = work["engagement_score"]
    if "user_id_hashed" in work.columns:
        change = scores.groupby(work["user_id_hashed"], sort=False).diff()
    else:
        change = scores.diff()

    # sort_index() restores the original row order after the time sort.
    return change.sort_index().fillna(0).to_numpy()


def classify_churn(df, inactivity_threshold=30, drop_threshold=20):
    """Add churn-risk columns to ``df`` and return it.

    Requires the ``engagement_score`` and ``days_since_last_session`` columns.
    ``user_id_hashed`` and ``timestamp`` are used when present so the score
    drop is measured per user, in chronological order.

    Columns written (``df`` is modified in place):
        engagement_drop: score change versus the user's previous session.
        churn_risk_flag: True when the row is inactive for at least
            ``inactivity_threshold`` days AND the score fell by at least
            ``drop_threshold`` points.
        risk_class: "High Risk" when flagged, "Medium Risk" when the score is
            below 50, otherwise "Low Risk".
        churn_probability_score: 0.8 / 0.5 / 0.2 for High / Medium / Low.

    Args:
        df: pandas DataFrame of scored sessions.
        inactivity_threshold: minimum days of inactivity for a churn flag.
        drop_threshold: minimum score drop (in score points) for a churn flag.

    Returns:
        The same DataFrame with the columns above added.

    Raises:
        KeyError: if ``engagement_score`` or ``days_since_last_session`` is missing.
    """
    df["engagement_drop"] = _score_change_per_user(df)

    inactive = df["days_since_last_session"] >= inactivity_threshold
    dropped = df["engagement_drop"] <= -drop_threshold
    df["churn_risk_flag"] = inactive & dropped

    # First matching condition wins, mirroring an if/elif chain.
    conditions = [
        df["churn_risk_flag"].to_numpy(dtype=bool),
        (df["engagement_score"] < 50).to_numpy(dtype=bool),
    ]
    df["risk_class"] = np.select(
        conditions, ["High Risk", "Medium Risk"], default="Low Risk"
    )
    df["churn_probability_score"] = np.select(conditions, [0.8, 0.5], default=0.2)

    return df


# --------------------------------------------------------------------------
# src/models/engagement_scoring.py
# --------------------------------------------------------------------------

def _as_utc(moment):
    """Return ``moment`` as a UTC ``pd.Timestamp``; ``None`` means the current time."""
    if moment is None:
        return pd.Timestamp.now(tz="UTC")
    stamp = pd.Timestamp(moment)
    if stamp.tzinfo is None:
        return stamp.tz_localize("UTC")
    return stamp.tz_convert("UTC")


def _first_present(df, candidates, default):
    """Return the first column of ``df`` named in ``candidates``, else ``default``."""
    for name in candidates:
        if name in df.columns:
            return df[name]
    return default


def _min_max(series):
    """Scale ``series`` to [0, 1); a constant series maps to 0 (epsilon avoids 0/0)."""
    low = series.min()
    return (series - low) / (series.max() - low + 1e-9)


def calculate_engagement_scores(df, now=None):
    """Add a 0-100 engagement score and tier to ``df`` and return it.

    Requires the ``timestamp`` and ``user_id_hashed`` columns. Session duration
    is read from ``session_duration`` or, failing that, ``dwell_time``; click
    engagement from ``click_through_rate`` or, failing that, ``page_depth``.
    If neither column of a pair exists, the constants 1 and 0.5 are used.

    Columns written (``df`` is modified in place): ``timestamp`` (parsed to
    UTC, unparseable values become NaT), ``days_since_last_session`` (days
    since the user's most recent session), ``session_count``,
    ``avg_session_duration``, ``click_engagement``, ``engagement_score`` and
    ``engagement_tier``.

    The score is a weighted sum of min-max normalised components:
    35% recency, 30% frequency, 20% duration, 15% click engagement.

    Args:
        df: pandas DataFrame with one row per session.
        now: reference time for recency; defaults to the current UTC time.
            Pass a fixed value for reproducible runs.

    Returns:
        The same DataFrame with the columns above added.

    Raises:
        KeyError: if ``timestamp`` or ``user_id_hashed`` is missing.
    """
    now = _as_utc(now)

    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")

    # Compute both per-user aggregates before adding columns to the frame.
    per_user = df.groupby("user_id_hashed")["timestamp"]
    last_seen = per_user.transform("max")
    visits = per_user.transform("count")

    # Recency is a property of the user (latest session), not of each row.
    df["days_since_last_session"] = (now - last_seen).dt.days
    df["session_count"] = visits

    df["avg_session_duration"] = _first_present(
        df, ("session_duration", "dwell_time"), 1
    )
    df["click_engagement"] = _first_present(
        df, ("click_through_rate", "page_depth"), 0.5
    )

    score = (
        0.35 * (1 - _min_max(df["days_since_last_session"]))  # fewer days = better
        + 0.30 * _min_max(df["session_count"])
        + 0.20 * _min_max(df["avg_session_duration"])
        + 0.15 * _min_max(df["click_engagement"])
    )
    df["engagement_score"] = (score * 100).round(2)

    points = df["engagement_score"]
    # NaN scores fail every comparison and fall through to the lowest tier.
    df["engagement_tier"] = np.select(
        [
            (points >= 75).to_numpy(dtype=bool),
            (points >= 50).to_numpy(dtype=bool),
            (points >= 25).to_numpy(dtype=bool),
        ],
        ["Tier 1 - Highly Engaged", "Tier 2 - Stable", "Tier 3 - Declining"],
        default="Tier 4 - At Risk",
    )

    return df


# --------------------------------------------------------------------------
# src/models/predictive_segments.py
# --------------------------------------------------------------------------

def _matches(df, column, expected):
    """Return a boolean array marking rows where ``df[column] == expected``.

    A missing column yields an all-False array of the right length, so the
    result is always safe to use as a row mask.
    """
    if column not in df.columns:
        return np.zeros(len(df), dtype=bool)
    return df[column].eq(expected).fillna(False).to_numpy(dtype=bool)


def build_predictive_segments(df):
    """Add a ``predictive_segment`` column to ``df`` and return it.

    Every row starts as "General". Rows with ``risk_class == "High Risk"`` and
    a true ``birthday_within_30_days`` become "At-Risk + Birthday Approaching".
    Rows with ``engagement_tier == "Tier 1 - Highly Engaged"`` become
    "High Value - Amplify"; this rule is applied last and therefore wins when
    both rules match. A rule whose input column is absent matches no rows.

    Args:
        df: pandas DataFrame; modified in place.

    Returns:
        The same DataFrame with ``predictive_segment`` added.
    """
    df["predictive_segment"] = "General"

    at_risk_birthday = _matches(df, "risk_class", "High Risk") & _matches(
        df, "birthday_within_30_days", True
    )
    df.loc[at_risk_birthday, "predictive_segment"] = "At-Risk + Birthday Approaching"

    high_value = _matches(df, "engagement_tier", "Tier 1 - Highly Engaged")
    df.loc[high_value, "predictive_segment"] = "High Value - Amplify"

    return df
import pandas as pd


def _best_bucket(values, buckets):
    """Return the label in ``buckets`` whose rows have the highest mean of ``values``.

    Rows whose bucket label is missing are ignored. Returns NaN when no bucket
    has a usable mean (for example, an empty frame).
    """
    means = values.groupby(buckets.to_numpy()).mean().dropna()
    if means.empty:
        return float("nan")
    return means.idxmax()


def optimize_send_time(df):
    """Add the globally best send hour and weekday to ``df`` and return it.

    The best hour (and weekday) is the one whose sessions have the highest
    mean ``click_engagement``. The same value is written to every row.

    Columns written (``df`` is modified in place):
        hour: UTC hour of each row's ``timestamp`` (NaN if unparseable).
        optimal_send_hour: hour with the highest mean click engagement.
        optimal_send_day: weekday name (e.g. "Tuesday") with the highest
            mean click engagement.

    Args:
        df: pandas DataFrame with ``timestamp`` and ``click_engagement`` columns.

    Returns:
        The same DataFrame with the columns above added.

    Raises:
        ValueError: if ``timestamp`` or ``click_engagement`` is missing.
    """
    for col in ("timestamp", "click_engagement"):
        if col not in df.columns:
            raise ValueError(f"{col} missing for send-time optimization")

    # Coerce bad timestamps to NaT instead of aborting the whole run.
    when = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")
    df["hour"] = when.dt.hour

    engagement = df["click_engagement"]
    df["optimal_send_hour"] = _best_bucket(engagement, df["hour"])
    df["optimal_send_day"] = _best_bucket(engagement, when.dt.day_name())

    return df