simace.sampling¶

Pedigree subsampling and random dropout for incomplete-observation scenarios.

sample¶

simace.sampling.sample ¶

Subsample phenotyped individuals before stats/plotting.

run_sample ¶

run_sample(phenotype, *, N_sample=0, case_ascertainment_ratio=1.0, seed=42)

Optionally subsample phenotyped individuals.

PARAMETER	DESCRIPTION
`phenotype`	DataFrame of phenotyped individuals. TYPE: `DataFrame`
`N_sample`	target sample size. `<= 0` or `>= len(phenotype)` passes everything through. TYPE: `int` DEFAULT: `0`
`case_ascertainment_ratio`	sampling weight for cases (`affected1 == True`) relative to controls. `1.0` = uniform; `0` = controls only. TYPE: `float` DEFAULT: `1.0`
`seed`	RNG seed. TYPE: `int` DEFAULT: `42`

RETURNS	DESCRIPTION
`DataFrame`	DataFrame with at most `N_sample` rows (or all rows if
`DataFrame`	`N_sample <= 0` or `N_sample >= len(phenotype)`).

Source code in simace/sampling/sample.py

@stage(reads=CENSORED, writes=CENSORED)
def run_sample(
    phenotype: pd.DataFrame,
    *,
    N_sample: int = 0,
    case_ascertainment_ratio: float = 1.0,
    seed: int = 42,
) -> pd.DataFrame:
    """Optionally subsample phenotyped individuals.

    Args:
        phenotype: DataFrame of phenotyped individuals.
        N_sample: target sample size. ``<= 0`` or ``>= len(phenotype)``
            passes everything through.
        case_ascertainment_ratio: sampling weight for cases
            (``affected1 == True``) relative to controls. ``1.0`` =
            uniform; ``0`` = controls only.
        seed: RNG seed.

    Returns:
        DataFrame with at most ``N_sample`` rows (or all rows if
        ``N_sample <= 0`` or ``N_sample >= len(phenotype)``).
    """
    n_total = len(phenotype)
    n_sample = int(N_sample)
    seed = int(seed)
    ratio = float(case_ascertainment_ratio)

    if ratio < 0:
        raise ValueError(f"case_ascertainment_ratio must be >= 0, got {ratio}")

    if n_sample <= 0 or n_sample >= n_total:
        if ratio != 1.0:
            logger.warning(
                "case_ascertainment_ratio=%.2f has no effect when N_sample=%d (all %d individuals passed through)",
                ratio,
                n_sample,
                n_total,
            )
        logger.info("Sample pass-through: keeping all %d individuals (N_sample=%d)", n_total, n_sample)
        return phenotype

    t0 = time.perf_counter()
    rng = np.random.default_rng(seed)

    if ratio == 1.0:
        indices = np.sort(rng.choice(n_total, n_sample, replace=False))
    else:
        is_case = phenotype["affected1"].values
        n_cases = int(is_case.sum())
        n_controls = n_total - n_cases

        if n_cases == 0:
            logger.warning(
                "No cases found (affected1); ignoring case_ascertainment_ratio=%.2f, falling back to uniform sampling",
                ratio,
            )
            indices = np.sort(rng.choice(n_total, n_sample, replace=False))
        elif n_cases == n_total:
            logger.warning(
                "All individuals are cases (affected1); ignoring "
                "case_ascertainment_ratio=%.2f, falling back to uniform sampling",
                ratio,
            )
            indices = np.sort(rng.choice(n_total, n_sample, replace=False))
        elif ratio == 0:
            actual_n = min(n_sample, n_controls)
            if actual_n < n_sample:
                logger.warning(
                    "case_ascertainment_ratio=0: clamping N_sample from %d to %d (only %d controls available)",
                    n_sample,
                    actual_n,
                    n_controls,
                )
            control_indices = np.where(~is_case)[0]
            indices = np.sort(rng.choice(control_indices, actual_n, replace=False))
        else:
            weights = np.where(is_case, ratio, 1.0)
            probabilities = weights / weights.sum()

            expected_case_prob = (ratio * n_cases) / (ratio * n_cases + n_controls)
            expected_cases_drawn = expected_case_prob * n_sample
            if expected_cases_drawn > 0.9 * n_cases:
                logger.warning(
                    "Extreme ascertainment: expected to draw %.0f of %d total cases (ratio=%.1f, N_sample=%d)",
                    expected_cases_drawn,
                    n_cases,
                    ratio,
                    n_sample,
                )

            indices = np.sort(rng.choice(n_total, n_sample, replace=False, p=probabilities))

    result = phenotype.iloc[indices].reset_index(drop=True)
    elapsed = time.perf_counter() - t0

    if ratio != 1.0:
        n_cases_sampled = int(result["affected1"].sum())
        logger.info(
            "Sampled %d → %d individuals (cases: %d, %.1f%%) in %.1fs (seed=%d, ratio=%.1f)",
            n_total,
            len(result),
            n_cases_sampled,
            100.0 * n_cases_sampled / len(result) if len(result) > 0 else 0,
            elapsed,
            seed,
            ratio,
        )
    else:
        logger.info("Sampled %d → %d individuals in %.1fs (seed=%d)", n_total, n_sample, elapsed, seed)
    return result

cli ¶

cli()

Command-line interface for subsampling phenotype data.

Source code in simace/sampling/sample.py

def cli() -> None:
    """Command-line interface for subsampling phenotype data."""
    from simace.core.cli_base import add_logging_args, init_logging

    parser = argparse.ArgumentParser(description="Subsample phenotyped individuals")
    add_logging_args(parser)
    parser.add_argument("--phenotype", required=True, help="Input phenotype parquet")
    parser.add_argument("--output", required=True, help="Output sampled phenotype parquet")
    parser.add_argument("--N-sample", type=int, default=0, help="Number of individuals to retain (0 = keep all)")
    parser.add_argument(
        "--case-ascertainment-ratio",
        type=float,
        default=1.0,
        help="Sampling weight for cases relative to controls (1.0 = uniform)",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed")

    args = parser.parse_args()
    init_logging(args)

    phenotype = pd.read_parquet(args.phenotype)
    result = run_sample(
        phenotype,
        N_sample=args.N_sample,
        case_ascertainment_ratio=args.case_ascertainment_ratio,
        seed=args.seed,
    )
    save_parquet(result, args.output)

dropout¶

simace.sampling.dropout ¶

Randomly drop individuals from a pedigree to simulate incomplete observation.

run_dropout ¶

run_dropout(pedigree, *, pedigree_dropout_rate=0, seed=42)

Remove a random fraction of individuals from the pedigree.

Dropped individuals are deleted entirely. Any mother/father/twin links that reference a dropped individual are set to -1 (unknown).

PARAMETER	DESCRIPTION
`pedigree`	DataFrame with columns id, mother, father, twin, etc. TYPE: `DataFrame`
`pedigree_dropout_rate`	fraction of individuals to drop, in `[0, 1)`. TYPE: `float` DEFAULT: `0`
`seed`	RNG seed. TYPE: `int` DEFAULT: `42`

RETURNS	DESCRIPTION
`DataFrame`	DataFrame with dropped rows removed and dangling links severed.

Source code in simace/sampling/dropout.py

@stage(reads=PEDIGREE, writes=PEDIGREE)
def run_dropout(
    pedigree: pd.DataFrame,
    *,
    pedigree_dropout_rate: float = 0,
    seed: int = 42,
) -> pd.DataFrame:
    """Remove a random fraction of individuals from the pedigree.

    Dropped individuals are deleted entirely. Any mother/father/twin links
    that reference a dropped individual are set to -1 (unknown).

    Args:
        pedigree: DataFrame with columns id, mother, father, twin, etc.
        pedigree_dropout_rate: fraction of individuals to drop, in ``[0, 1)``.
        seed: RNG seed.

    Returns:
        DataFrame with dropped rows removed and dangling links severed.
    """
    rate = float(pedigree_dropout_rate)
    seed = int(seed)

    if rate <= 0:
        logger.info("Dropout pass-through: rate=%.3f, keeping all %d individuals", rate, len(pedigree))
        return pedigree

    n_total = len(pedigree)
    n_drop = round(n_total * rate)

    if n_drop <= 0:
        logger.info("Dropout pass-through: n_drop=0 for rate=%.4f, N=%d", rate, n_total)
        return pedigree

    if n_drop >= n_total:
        raise ValueError(f"Dropout rate {rate} would remove all {n_total} individuals")

    t0 = time.perf_counter()
    rng = np.random.default_rng(seed)
    drop_indices = rng.choice(n_total, n_drop, replace=False)
    drop_ids = pedigree["id"].values[drop_indices]

    keep_mask = np.ones(n_total, dtype=bool)
    keep_mask[drop_indices] = False
    result = pedigree.loc[keep_mask].copy()

    for col in ("mother", "father", "twin"):
        if col in result.columns:
            vals = result[col].values
            dangling = np.isin(vals, drop_ids) & (vals >= 0)
            if dangling.any():
                result.loc[result.index[dangling], col] = -1

    result = result.reset_index(drop=True)
    elapsed = time.perf_counter() - t0

    logger.info(
        "Dropout: %d → %d individuals (dropped %d, rate=%.3f) in %.2fs (seed=%d)",
        n_total,
        len(result),
        n_drop,
        rate,
        elapsed,
        seed,
    )
    return result

cli ¶

cli()

Command-line interface for pedigree dropout.

Source code in simace/sampling/dropout.py

def cli() -> None:
    """Command-line interface for pedigree dropout."""
    from simace.core.cli_base import add_logging_args, init_logging

    parser = argparse.ArgumentParser(description="Randomly drop individuals from a pedigree")
    add_logging_args(parser)
    parser.add_argument("--pedigree", required=True, help="Input pedigree parquet")
    parser.add_argument("--output", required=True, help="Output pedigree parquet")
    parser.add_argument("--dropout-rate", type=float, default=0, help="Fraction of individuals to drop (0-1)")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")

    args = parser.parse_args()
    init_logging(args)

    pedigree = pd.read_parquet(args.pedigree)
    result = run_dropout(pedigree, pedigree_dropout_rate=args.dropout_rate, seed=args.seed)
    save_parquet(result, args.output)