diff options
author | Arun Isaac | 2025-09-02 17:55:12 +0100 |
---|---|---|
committer | Arun Isaac | 2025-09-02 22:32:03 +0100 |
commit | bf80585e2b3cb7ba4abe474af06bb49e7259f94c (patch) | |
tree | 9c8c25c57bd60fd226e808fadcbed0f3620d561a | |
parent | 3f3dd13f75ab91862c9e0cbd5e65f1da1e26cf4b (diff) | |
download | pyhegp-bf80585e2b3cb7ba4abe474af06bb49e7259f94c.tar.gz pyhegp-bf80585e2b3cb7ba4abe474af06bb49e7259f94c.tar.lz pyhegp-bf80585e2b3cb7ba4abe474af06bb49e7259f94c.zip |
Add is_genotype_metadata_column.
Promote genotype_reserved_column_name_p from helpers.strategies to is_genotype_metadata_column in pyhegp.serialization, and use it everywhere.
-rw-r--r-- | pyhegp/serialization.py | 5 | ||||
-rw-r--r-- | tests/helpers/strategies.py | 9 |
2 files changed, 7 insertions, 7 deletions
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index c86d216..ba2cb0f 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -78,13 +78,16 @@ def read_tsv(file, dtype):
                        # data file.
                        skip_blank_lines=False)
 
+def is_genotype_metadata_column(name):
+    return name.lower() in {"chromosome", "position", "reference"}
+
 def read_genotype(file):
     df = read_tsv(file, {"chromosome": "str",
                          "position": "int",
                          "reference": "str"})
     sample_columns = [column
                       for column in df.columns
-                      if column not in ["chromosome", "position", "reference"]]
+                      if not is_genotype_metadata_column(column)]
     df.chromosome = df.chromosome.astype("str")
     df.position = df.position.astype("int")
     if "reference" in df:
diff --git a/tests/helpers/strategies.py b/tests/helpers/strategies.py
index 7edf667..00c4c11 100644
--- a/tests/helpers/strategies.py
+++ b/tests/helpers/strategies.py
@@ -19,7 +19,7 @@
 from hypothesis import strategies as st
 from hypothesis.extra.pandas import column, columns, data_frames
 
-from pyhegp.serialization import Summary
+from pyhegp.serialization import Summary, is_genotype_metadata_column
 from pyhegp.utils import negate
 
 tabless_printable_ascii_text = st.text(
@@ -44,11 +44,8 @@ reference_column = column(name="reference",
                                  include_characters=("A", "G", "C", "T")),
                              min_size=1))
 
-def genotype_reserved_column_name_p(name):
-    return name.lower() in {"chromosome", "position", "reference"}
-
 sample_names = st.lists(tabless_printable_ascii_text
-                        .filter(negate(genotype_reserved_column_name_p)),
+                        .filter(negate(is_genotype_metadata_column)),
                         unique=True)
 
 @st.composite
@@ -70,7 +67,7 @@ def genotype_frames(draw):
                                  dtype="float64",
                                  elements=st.floats(allow_nan=False)))))
     return genotype.drop_duplicates(subset=list(
-        filter(genotype_reserved_column_name_p,
+        filter(is_genotype_metadata_column,
                genotype.columns)),
                                     ignore_index=True)
 