From bf80585e2b3cb7ba4abe474af06bb49e7259f94c Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Tue, 2 Sep 2025 17:55:12 +0100 Subject: Add is_genotype_metadata_column. Promote genotype_reserved_column_name_p from helpers.strategies to is_genotype_metadata_column in pyhegp.serialization, and use it everywhere. --- pyhegp/serialization.py | 5 ++++- tests/helpers/strategies.py | 9 +++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py index c86d216..ba2cb0f 100644 --- a/pyhegp/serialization.py +++ b/pyhegp/serialization.py @@ -78,13 +78,16 @@ def read_tsv(file, dtype): # data file. skip_blank_lines=False) +def is_genotype_metadata_column(name): + return name.lower() in {"chromosome", "position", "reference"} + def read_genotype(file): df = read_tsv(file, {"chromosome": "str", "position": "int", "reference": "str"}) sample_columns = [column for column in df.columns - if column not in ["chromosome", "position", "reference"]] + if not is_genotype_metadata_column(column)] df.chromosome = df.chromosome.astype("str") df.position = df.position.astype("int") if "reference" in df: diff --git a/tests/helpers/strategies.py b/tests/helpers/strategies.py index 7edf667..00c4c11 100644 --- a/tests/helpers/strategies.py +++ b/tests/helpers/strategies.py @@ -19,7 +19,7 @@ from hypothesis import strategies as st from hypothesis.extra.pandas import column, columns, data_frames -from pyhegp.serialization import Summary +from pyhegp.serialization import Summary, is_genotype_metadata_column from pyhegp.utils import negate tabless_printable_ascii_text = st.text( @@ -44,11 +44,8 @@ reference_column = column(name="reference", include_characters=("A", "G", "C", "T")), min_size=1)) -def genotype_reserved_column_name_p(name): - return name.lower() in {"chromosome", "position", "reference"} - sample_names = st.lists(tabless_printable_ascii_text - .filter(negate(genotype_reserved_column_name_p)), + .filter(negate(is_genotype_metadata_column)), unique=True) @st.composite @@ -70,7 +67,7 @@ def genotype_frames(draw): dtype="float64", elements=st.floats(allow_nan=False))))) return genotype.drop_duplicates(subset=list( - filter(genotype_reserved_column_name_p, + filter(is_genotype_metadata_column, genotype.columns)), ignore_index=True) -- cgit 1.4.1