# alx_heor > Multi-source HEOR analytics library for healthcare claims data (IQVIA, Optum, Komodo). Build patient cohorts, analyze enrollment, and study treatment patterns. **Version**: 0.5.0 ## Quick Start ```python from alx_heor import RedshiftConnection, get_cohort from alx_heor.cohort import CohortCriteria, DiagnosisCriteria conn = RedshiftConnection().connect() criteria = CohortCriteria( primary_diagnosis=DiagnosisCriteria( codes=['G700', 'G7000', 'G7001'], # gMG ICD-10 codes (no dots) min_count=2, days_apart=30, label="gMG ≥2 Dx, 30 days apart", ), min_age=18, exclude_specialties=['OPHTHAL', 'OPTOMTRY'], ) result = get_cohort( conn, source='iqvia', schema='iqvia_pharmetrics_2024q3', criteria=criteria, start_year=2015, end_year=2024, ) print(result.summary()) df_cohort = result.df_cohort ``` ## Supported Data Sources | Source | Database | Patient ID Column | Service Date Column | |--------|----------|-------------------|---------------------| | iqvia | Redshift | pat_id | from_dt | | optum | Redshift | patid | svcdate | | komodo | Snowflake | patient_id | service_date | ## API Reference ### Database Module (alx_heor.database) #### RedshiftConnection Database connection with environment-based credentials. ```python class RedshiftConnection: def __init__( self, host: str | None = None, # Falls back to REDSHIFT_HOST env var database: str | None = None, # Falls back to REDSHIFT_DATABASE env var user: str | None = None, # Falls back to REDSHIFT_USER env var password: str | None = None, # Falls back to REDSHIFT_PASSWORD env var ) -> None def connect(self) -> RedshiftConnection def query(self, sql: str) -> pd.DataFrame def execute(self, sql: str) -> None def get_tables(self, schema: str, keyword: str | None = None) -> list[str] def get_columns(self, schema: str, table: str, keyword: str | None = None) -> list[str] def get_schemas(self, keyword: str | None = None) -> list[str] def close(self) -> None def drop_table(self, schema: str, table: str) -> None def write_table(self, df: pd.DataFrame, schema: str, table: str, if_exists: str = "replace", chunksize: int = 5000) -> int def write_from_select(self, sql: str, schema: str, table: str, overwrite: bool = True) -> None @property def is_connected(self) -> bool ``` **Usage:** ```python # Context manager (recommended) with RedshiftConnection().connect() as conn: df = conn.query("SELECT * FROM schema.table LIMIT 10") # Manual connection conn = RedshiftConnection().connect() df = conn.query("SELECT * FROM schema.table") conn.close() ``` #### format_in_clause ```python def format_in_clause(items: list) -> str ``` Format a list for SQL IN clause. Returns comma-separated quoted string. ```python >>> format_in_clause(['G35', 'G36', 'G37']) "'G35', 'G36', 'G37'" ``` --- ### Config Module (alx_heor.config) #### get_source_config ```python def get_source_config(source: str) -> DataSourceConfig ``` Get configuration for a data source. Returns column mappings and table patterns. **Parameters:** - `source`: 'iqvia', 'optum', or 'komodo' (case-insensitive) **Returns:** DataSourceConfig TypedDict with keys: - `name`: Human-readable name - `claims_table_pattern`: e.g., 'claims_{year}' - `enrollment_table_pattern`: e.g., 'enroll2_{year}' - `rx_lookup_table`: NDC lookup table name - `pr_lookup_table`: Procedure lookup table name - `columns`: ColumnMapping with patient_id, service_date, diagnosis, etc. - `default_claims_columns`: Default columns for claims queries ```python >>> config = get_source_config('iqvia') >>> config['columns']['patient_id'] 'pat_id' >>> config['columns']['diagnosis'] ['diag1', 'diag2', ..., 'diag12', 'diag_admit'] ``` #### list_sources ```python def list_sources() -> list[str] ``` Returns: `['iqvia', 'komodo', 'optum']` --- ### Claims Module (alx_heor.claims) #### get_claims ```python def get_claims( conn: RedshiftConnection, source: str, schema: str, diagnosis_codes: list[str], start_year: int, end_year: int, table_pattern: str | None = None, columns: list[str] | None = None, diagnosis_columns: list[str] | None = None, include_demographics: bool = False, ) -> pd.DataFrame ``` Query claims filtered by diagnosis codes. Builds UNION ALL across yearly tables. **Parameters:** - `conn`: Active RedshiftConnection - `source`: 'iqvia', 'optum', 'komodo' - `schema`: Database schema name - `diagnosis_codes`: ICD codes to filter (exact match, no dots: 'G700' not 'G70.0') - `start_year`, `end_year`: Year range for claims tables - `include_demographics`: If True, adds der_yob/der_sex from enrollment (IQVIA) **Returns:** DataFrame with one row per claim #### build_claims_sql ```python def build_claims_sql( source: str, schema: str, diagnosis_codes: list[str], start_year: int, end_year: int, table_pattern: str | None = None, columns: list[str] | None = None, diagnosis_columns: list[str] | None = None, ) -> str ``` Preview SQL without executing. Returns SQL string. #### get_index_dates ```python def get_index_dates( df_claims: pd.DataFrame, source: str | None = None, patient_id_col: str | None = None, date_col: str | None = None, min_diagnosis_count: int = 1, days_apart: int = 30, ) -> pd.DataFrame ``` Identify index dates from claims. **Parameters:** - `df_claims`: Output from get_claims() - `source`: For column name defaults - `min_diagnosis_count`: 1 = any diagnosis, 2 = standard RWE criterion - `days_apart`: Minimum days between first and last diagnosis (when min_count >= 2) **Returns:** DataFrame with columns: `patient_id`, `index_date` #### add_demographics ```python def add_demographics( df_index: pd.DataFrame, df_claims: pd.DataFrame | None = None, df_demographics: pd.DataFrame | None = None, source: str | None = None, patient_id_col: str | None = None, yob_col: str | None = None, sex_col: str | None = None, ) -> pd.DataFrame ``` Add age_at_index, year_of_birth, sex to cohort. **Note for IQVIA:** Demographics are in `enroll` table, not claims. Use `get_claims(..., include_demographics=True)`. #### filter_demographics ```python def filter_demographics( df: pd.DataFrame, min_age: int | None = None, max_age: int | None = None, valid_sex_only: bool = True, age_col: str = "age_at_index", sex_col: str = "sex", ) -> pd.DataFrame ``` Filter by age/sex criteria. Returns copy (non-destructive). --- ### Cohort Module (alx_heor.cohort) #### DiagnosisCriteria ```python @dataclass class DiagnosisCriteria: codes: list[str] # ICD codes (no dots) min_count: int = 1 # Minimum diagnoses required days_apart: int = 0 # Minimum days between first/last window_start: int | None = None # Days relative to index (negative = before) window_end: int | None = None # Days relative to index diagnosis_position: Literal["any", "primary", "admit"] = "any" require_inpatient: bool = False require_outpatient: bool = False label: str = "" # For attrition table ``` **Examples:** ```python # Primary inclusion: 2+ gMG diagnoses 30 days apart DiagnosisCriteria( codes=["G700", "G7000", "G7001"], min_count=2, days_apart=30, label="gMG ≥2 Dx, 30 days apart", ) # Baseline exclusion: malignancy in year before index DiagnosisCriteria( codes=["C00", "C01"], window_start=-365, window_end=-1, label="Malignancy in baseline", ) ``` #### ProcedureCriteria ```python @dataclass class ProcedureCriteria: codes: list[str] # CPT/HCPCS codes min_count: int = 1 window_start: int | None = None window_end: int | None = None label: str = "" ``` #### MedicationCriteria ```python @dataclass class MedicationCriteria: generic_names: list[str] | None = None # e.g., ['eculizumab', 'ravulizumab'] ndc_codes: list[str] | None = None procedure_codes: list[str] | None = None # J-codes min_count: int = 1 window_start: int | None = None window_end: int | None = None label: str = "" ``` At least one of generic_names, ndc_codes, or procedure_codes required. #### EnrollmentCriteria ```python @dataclass class EnrollmentCriteria: months_before: int = 0 # Required baseline enrollment months_after: int = 0 # Required follow-up enrollment max_gap_months: int = 1 # Maximum allowed gap label: str = "" ``` #### CohortCriteria ```python @dataclass class CohortCriteria: # Inclusion primary_diagnosis: DiagnosisCriteria required_diagnoses: list[DiagnosisCriteria] = field(default_factory=list) required_procedures: list[ProcedureCriteria] = field(default_factory=list) required_medications: list[MedicationCriteria] = field(default_factory=list) # Exclusion excluded_diagnoses: list[DiagnosisCriteria] = field(default_factory=list) excluded_procedures: list[ProcedureCriteria] = field(default_factory=list) excluded_medications: list[MedicationCriteria] = field(default_factory=list) # Demographics min_age: int | None = 18 max_age: int | None = None valid_sex_only: bool = True # Specialty exclude_specialties: list[str] | None = None require_specialty_confirmation: bool = False # Enrollment enrollment: EnrollmentCriteria | None = None # Index date index_date_method: Literal["first_dx", "second_dx", "first_rx"] = "first_dx" ``` #### get_cohort ```python def get_cohort( conn: RedshiftConnection, source: str, schema: str, criteria: CohortCriteria, start_year: int, end_year: int, study_start: str | None = None, # e.g., '2015-01-01' study_end: str | None = None, # e.g., '2024-03-31' include_claims: bool = True, ) -> CohortResult ``` Build cohort with full attrition tracking. This is the main high-level function. **Returns:** CohortResult with: - `df_cohort`: Final cohort (one row per patient) - `df_claims`: All diagnosis claims - `attrition`: Dict of step → count - `criteria`: The CohortCriteria used - `df_enrollment`, `df_censor`, `df_payer`: If enrollment criteria applied - `summary()`: Method returning formatted attrition table **Example:** ```python result = get_cohort(conn, source='iqvia', schema='...', criteria=criteria, start_year=2015, end_year=2024) print(result.summary()) df = result.df_cohort ``` --- ### Enrollment Module (alx_heor.enrollment) #### get_enrollment ```python def get_enrollment( conn: RedshiftConnection, source: str, schema: str, patient_ids: list[str], start_year: int, end_year: int, columns: list[str] | None = None, ) -> pd.DataFrame ``` Fetch enrollment data. Returns DataFrame with patient_id and month_id (YYYYMM format). #### calculate_enrollment_gaps ```python def calculate_enrollment_gaps( df_enrollment: pd.DataFrame, df_index: pd.DataFrame, patient_id_col: str = "pat_id", index_date_col: str = "index_date", months_pre: int = 6, months_post: int = 12, ) -> pd.DataFrame ``` Calculate gaps relative to index date. Returns DataFrame with `max_gap_months` column. #### filter_continuous_enrollment ```python def filter_continuous_enrollment( df_gaps: pd.DataFrame, max_gap: int = 1, patient_id_col: str = "pat_id", ) -> pd.DataFrame ``` Filter to patients where `max_gap_months <= max_gap`. #### get_censor_dates ```python def get_censor_dates( df_gaps: pd.DataFrame, study_end: str, # e.g., '2024-03-31' max_gap_for_censor: int = 3, patient_id_col: str = "pat_id", index_date_col: str = "index_date", ) -> pd.DataFrame ``` Calculate censoring dates for survival analysis. **Returns:** DataFrame with: - `censor_date`: Date patient is censored - `is_censored_by_gap`: True if gap-censored, False if study-end censored - `days_to_censor`: Days from index to censor #### get_max_enrollment_gap ```python def get_max_enrollment_gap( df_enrollment: pd.DataFrame, df_index: pd.DataFrame, patient_id_col: str = "pat_id", index_date_col: str = "index_date", months_pre: int = 6, months_post: int = 12, ) -> pd.DataFrame ``` Convenience function returning one row per patient with `max_gap_months`. #### analyze_enrollment ```python def analyze_enrollment( conn: RedshiftConnection, source: str, schema: str, df_cohort: pd.DataFrame, start_year: int, end_year: int, patient_id_col: str = "pat_id", index_date_col: str = "index_date", months_pre: int = 6, months_post: int = 12, max_gap_continuous: int = 1, max_gap_censor: int = 3, study_end: str | None = None, ) -> EnrollmentResult ``` Complete enrollment analysis workflow. Returns `EnrollmentResult` dataclass. --- ### Medications Module (alx_heor.medications) #### lookup_medications ```python def lookup_medications( conn: RedshiftConnection, source: str, schema: str, generic_names: list[str], exclude_keywords: list[str] | None = None, include_jcodes_only: bool = True, ) -> MedicationLookupResult ``` Look up NDC codes and J-codes by generic name. **Returns:** `MedicationLookupResult` with: - `df_ndc`: DataFrame of NDC codes - `df_jcodes`: DataFrame of J-codes - `ndc_list`: List for SQL IN clause - `jcode_list`: List for SQL IN clause - `summary()`: Human-readable summary **Example:** ```python result = lookup_medications( conn, source='iqvia', schema='...', generic_names=['eculizumab', 'ravulizumab', 'efgartigimod'], exclude_keywords=['HYALU'], # Exclude hyaluronidase formulations ) print(result.summary()) ``` #### get_treatment_claims ```python def get_treatment_claims( conn: RedshiftConnection, source: str, schema: str, patient_ids: list[str], start_year: int, end_year: int, ndc_codes: list[str] | None = None, jcodes: list[str] | None = None, columns: list[str] | None = None, ) -> pd.DataFrame ``` Query treatment claims by NDC or J-code for specified patients. #### assign_medication_category ```python def assign_medication_category( df_claims: pd.DataFrame, medication_map: dict[str, str], # {code: medication_name} ndc_column: str = "ndc", proc_column: str = "proc1", category_column: str = "medication", ) -> pd.DataFrame ``` Add medication category column based on NDC/J-code mapping. ```python med_map = {'J1300': 'eculizumab', 'J9332': 'efgartigimod', 'J1303': 'ravulizumab'} df = assign_medication_category(df_claims, med_map) ``` #### identify_treatment_episodes ```python def identify_treatment_episodes( df_claims: pd.DataFrame, patient_id_col: str = "pat_id", date_col: str = "from_dt", medication_col: str = "medication", gap_days: int = 45, ) -> pd.DataFrame ``` Group claims into treatment episodes. New episode starts after gap > gap_days. **Returns:** DataFrame with columns: - `patient_id`, `medication`, `episode_num` - `start_date`, `end_date`, `num_claims`, `duration_days` #### get_first_treatment_date ```python def get_first_treatment_date( df_claims: pd.DataFrame, patient_id_col: str = "pat_id", date_col: str = "from_dt", medication_col: str | None = None, ) -> pd.DataFrame ``` Returns DataFrame with `patient_id` and `first_treatment_date`. #### calculate_pdc ```python def calculate_pdc( df_claims: pd.DataFrame, patient_id_col: str = "pat_id", date_col: str = "from_dt", days_supply_col: str | None = None, # Defaults to 30 if None observation_window: int = 365, index_date_col: str | None = None, ) -> pd.DataFrame ``` Calculate Proportion of Days Covered (adherence metric). **Returns:** DataFrame with: - `patient_id`, `pdc` (0.0-1.0), `covered_days`, `observation_days` PDC >= 0.80 is generally considered adherent. #### classify_payer_type ```python def classify_payer_type( df_enrollment: pd.DataFrame, patient_id_col: str = "pat_id", payer_col: str = "pay_type", index_date_col: str | None = None, payer_mapping: dict[str, list[str]] | None = None, ) -> pd.DataFrame ``` Classify patients by payer type (COMMERCIAL, MEDICARE, MEDICAID, OTHER). Default IQVIA mapping: C/S=Commercial, R/T/A=Medicare, M=Medicaid. --- ## IQVIA-Specific Notes ### Patient ID Columns - **pat_id**: Primary identifier in claims (claims_YYYY) and enrollment (enroll2_YYYY). Always use for joins. - **iq_patient_id**: Secondary identifier in `enroll` table only. Legacy code joins claims to `enroll` via pat_id, then groups by iq_patient_id. ### Demographics For IQVIA, demographics (der_yob, der_sex) are in the `enroll` table, not claims. - Use `get_claims(..., include_demographics=True)` to fetch automatically - Or query: `SELECT pat_id, der_yob, der_sex FROM schema.enroll WHERE pat_id IN (...)` ### Age > 85 Handling IQVIA sets `der_yob = 0` or NULL for patients older than 85. The library handles this by setting `der_yob = study_end_year - 86`. ### ICD Code Format Use codes without dots: `'G700'` not `'G70.0'` ### Table Patterns - Claims: `claims_{year}` (claims_2006 through claims_2025) - Enrollment: `enroll2_{year}` (enroll2_2006 through enroll2_2025) - Demographics: `enroll` (single table, not yearly) ### Specialty Exclusion For gMG studies, exclude ophthalmology to avoid ocular MG: ```python exclude_specialties=['OPHTHAL', 'OPTOMTRY'] ``` --- ## Common Patterns ### Build gMG Cohort ```python from alx_heor import RedshiftConnection, get_cohort from alx_heor.cohort import CohortCriteria, DiagnosisCriteria, EnrollmentCriteria conn = RedshiftConnection().connect() criteria = CohortCriteria( primary_diagnosis=DiagnosisCriteria( codes=['G700', 'G7000', 'G7001'], min_count=2, days_apart=30, label="gMG ≥2 Dx, 30 days apart", ), min_age=18, exclude_specialties=['OPHTHAL', 'OPTOMTRY'], enrollment=EnrollmentCriteria( months_before=6, months_after=6, max_gap_months=1, ), ) result = get_cohort( conn, source='iqvia', schema='iqvia_pharmetrics_2024q3', criteria=criteria, start_year=2015, end_year=2024, study_start='2015-01-01', study_end='2023-12-31', ) print(result.summary()) df_cohort = result.df_cohort ``` ### Analyze Treatment ```python from alx_heor.medications import ( lookup_medications, get_treatment_claims, assign_medication_category, identify_treatment_episodes, ) # Find drug codes rx_result = lookup_medications( conn, source='iqvia', schema='...', generic_names=['eculizumab', 'ravulizumab', 'efgartigimod'], ) # Get treatment claims df_rx = get_treatment_claims( conn, source='iqvia', schema='...', patient_ids=df_cohort['patient_id'].tolist(), start_year=2015, end_year=2024, ndc_codes=rx_result.ndc_list, jcodes=rx_result.jcode_list, ) # Categorize and analyze med_map = { 'J1300': 'eculizumab', 'J1303': 'ravulizumab', 'J9332': 'efgartigimod', 'J9334': 'efgartigimod', } df_rx = assign_medication_category(df_rx, med_map) df_episodes = identify_treatment_episodes(df_rx, gap_days=45) ``` ### Custom SQL Query ```python conn = RedshiftConnection().connect() df = conn.query(""" SELECT pat_id, from_dt, diag1, diag2 FROM schema.claims_2024 WHERE diag1 IN ('G700', 'G7000', 'G7001') LIMIT 1000 """) ```