diff options
author | Arun Isaac | 2025-09-02 18:01:07 +0100 |
---|---|---|
committer | Arun Isaac | 2025-09-02 22:32:03 +0100 |
commit | a64055f237800b4e1427f566ebdb6c3875f1b3ac (patch) | |
tree | 9ad881c62afb7f9fef5125d7f4e2ecffc50991b7 | |
parent | d91f5403c040d23f278844dd2f2191fe07504411 (diff) | |
download | pyhegp-a64055f237800b4e1427f566ebdb6c3875f1b3ac.tar.gz pyhegp-a64055f237800b4e1427f566ebdb6c3875f1b3ac.tar.lz pyhegp-a64055f237800b4e1427f566ebdb6c3875f1b3ac.zip |
Merge, not concat, genotype frames.
pd.concat duplicates the metadata columns, and is generally the wrong approach to the problem.
-rw-r--r-- | pyhegp/pyhegp.py | 10 | ||||
-rw-r--r-- | tests/test_pyhegp.py | 2 |
2 files changed, 8 insertions, 4 deletions
```diff
diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py
index f204a36..98e904c 100644
--- a/pyhegp/pyhegp.py
+++ b/pyhegp/pyhegp.py
@@ -26,7 +26,7 @@ import numpy as np
 import pandas as pd
 from scipy.stats import special_ortho_group
 
-from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype, write_key
+from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype, write_key, is_genotype_metadata_column
 
 Stats = namedtuple("Stats", "n mean std")
 
@@ -111,6 +111,12 @@ def encrypt_genotype(genotype, key, summary):
                      axis="columns")
 
 def cat_genotype(genotypes):
+    def cat2(genotype1, genotype2):
+        return pd.merge(genotype1, genotype2,
+                        how="inner",
+                        on=list(filter(is_genotype_metadata_column,
+                                       genotype1.columns)))
+
     match genotypes:
         # If there are no input data frames, return an empty data
         # frame with the chromosome and position columns.
@@ -121,7 +127,7 @@ def cat_genotype(genotypes):
             genotype.position = genotype.position.astype("int")
             return genotype
         case _:
-            return pd.concat(genotypes)
+            return reduce(cat2, genotypes)
 
 @click.group()
 def main():
diff --git a/tests/test_pyhegp.py b/tests/test_pyhegp.py
index 7989063..88f13ef 100644
--- a/tests/test_pyhegp.py
+++ b/tests/test_pyhegp.py
@@ -26,7 +26,6 @@ from hypothesis import given, settings, strategies as st
 from hypothesis.extra.numpy import arrays, array_shapes
 import numpy as np
 import pandas as pd
-import pytest
 from pytest import approx
 
 from pyhegp.pyhegp import Stats, main, hegp_encrypt, hegp_decrypt, random_key, pool_stats, standardize, unstandardize, cat_genotype
@@ -166,7 +165,6 @@ def catenable_genotype_frames(draw):
             for start, end in pairwise([0] + split_points + [len(sample_names)])]
 
 
-@pytest.mark.xfail
 @given(catenable_genotype_frames())
 def test_cat_genotype(genotypes):
     def metadata_columns(genotype):
```