about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arun Isaac 2025-09-02 18:01:07 +0100
committer Arun Isaac 2025-09-02 22:32:03 +0100
commit a64055f237800b4e1427f566ebdb6c3875f1b3ac (patch)
tree 9ad881c62afb7f9fef5125d7f4e2ecffc50991b7
parent d91f5403c040d23f278844dd2f2191fe07504411 (diff)
download pyhegp-a64055f237800b4e1427f566ebdb6c3875f1b3ac.tar.gz
pyhegp-a64055f237800b4e1427f566ebdb6c3875f1b3ac.tar.lz
pyhegp-a64055f237800b4e1427f566ebdb6c3875f1b3ac.zip
Merge, not concat, genotype frames.
pd.concat duplicates the metadata columns, and is generally the wrong
approach to the problem.
-rw-r--r-- pyhegp/pyhegp.py 10
-rw-r--r-- tests/test_pyhegp.py 2
2 files changed, 8 insertions, 4 deletions
diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py
index f204a36..98e904c 100644
--- a/pyhegp/pyhegp.py
+++ b/pyhegp/pyhegp.py
@@ -26,7 +26,7 @@ import numpy as np
 import pandas as pd
 from scipy.stats import special_ortho_group
 
-from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype, write_key
+from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype, write_key, is_genotype_metadata_column
 
 Stats = namedtuple("Stats", "n mean std")
 
@@ -111,6 +111,12 @@ def encrypt_genotype(genotype, key, summary):
                      axis="columns")
 
 def cat_genotype(genotypes):
+    def cat2(genotype1, genotype2):
+        return pd.merge(genotype1, genotype2,
+                        how="inner",
+                        on=list(filter(is_genotype_metadata_column,
+                                       genotype1.columns)))
+
     match genotypes:
         # If there are no input data frames, return an empty data
         # frame with the chromosome and position columns.
@@ -121,7 +127,7 @@ def cat_genotype(genotypes):
             genotype.position = genotype.position.astype("int")
             return genotype
         case _:
-            return pd.concat(genotypes)
+            return reduce(cat2, genotypes)
 
 @click.group()
 def main():
diff --git a/tests/test_pyhegp.py b/tests/test_pyhegp.py
index 7989063..88f13ef 100644
--- a/tests/test_pyhegp.py
+++ b/tests/test_pyhegp.py
@@ -26,7 +26,6 @@ from hypothesis import given, settings, strategies as st
 from hypothesis.extra.numpy import arrays, array_shapes
 import numpy as np
 import pandas as pd
-import pytest
 from pytest import approx
 
 from pyhegp.pyhegp import Stats, main, hegp_encrypt, hegp_decrypt, random_key, pool_stats, standardize, unstandardize, cat_genotype
@@ -166,7 +165,6 @@ def catenable_genotype_frames(draw):
             for start, end
             in pairwise([0] + split_points + [len(sample_names)])]
 
-@pytest.mark.xfail
 @given(catenable_genotype_frames())
 def test_cat_genotype(genotypes):
     def metadata_columns(genotype):