about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 10
-rw-r--r-- pyhegp/pyhegp.py 14
-rw-r--r-- tests/test_pyhegp.py 26
3 files changed, 27 insertions, 23 deletions
diff --git a/README.md b/README.md
index 81c153c..ee7057f 100644
--- a/README.md
+++ b/README.md
@@ -65,9 +65,9 @@ pyhegp --help
 
 In this simple scenario, there is only one data owner and they wish to share their encrypted data with a researcher. The data owner encrypts their data with:
 ```
-pyhegp encrypt -o encrypted-genotype.tsv genotype.tsv
+pyhegp encrypt genotype.tsv
 ```
-They then send the encrypted data to the researcher. Note that data sharing is carried out-of-band and is outside the scope of `pyhegp`.
+They then send the encrypted data `genotype.tsv.hegp` to the researcher. Note that data sharing is carried out-of-band and is outside the scope of `pyhegp`.
 
 ## Joint/federated analysis with many data owners
 
@@ -83,11 +83,11 @@ pyhegp pool -o complete-summary summary1 summary2 ...
 ```
 The data broker shares these summary statistics with the data owners. The data owners standardize their data using these summary statistics, and encrypt their data using a random key.
 ```
-pyhegp encrypt -s complete-summary -o encrypted-genotype.tsv genotype.tsv
+pyhegp encrypt -s complete-summary genotype.tsv
 ```
-Finally, the data owners share the encrypted data with the broker who concatenates it and shares it with all parties.
+Finally, the data owners share the encrypted data `genotype.tsv.hegp` with the broker who concatenates it and shares it with all parties.
 ```
-pyhegp cat -o complete-encrypted-genotype.tsv encrypted-genotype1.tsv encrypted-genotype2.tsv ...
+pyhegp cat -o complete-genotype.tsv.hegp genotype1.tsv.hegp genotype2.tsv.hegp ...
 ```
 Note that all data sharing is carried out-of-band and is outside the scope of `pyhegp`.
 
diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py
index 650b1f8..4db76ea 100644
--- a/pyhegp/pyhegp.py
+++ b/pyhegp/pyhegp.py
@@ -18,6 +18,8 @@
 
 from collections import namedtuple
 from functools import reduce
+from pathlib import Path
+import sys
 
 import click
 import numpy as np
@@ -143,10 +145,7 @@ def pool(pooled_summary_file, summary_files):
               help="Summary statistics file")
 @click.option("--key", "-k", "key_file", type=click.File("w"),
               help="Output key")
-@click.option("--output", "-o", "ciphertext_file", type=click.File("w"),
-              default="-",
-              help="Output ciphertext")
-def encrypt(genotype_file, summary_file, key_file, ciphertext_file):
+def encrypt(genotype_file, summary_file, key_file):
     genotype = read_genotype(genotype_file)
     if summary_file:
         summary = read_summary(summary_file)
@@ -162,7 +161,12 @@ def encrypt(genotype_file, summary_file, key_file, ciphertext_file):
     if len(encrypted_genotype) < len(genotype):
         dropped_snps = len(genotype) - len(encrypted_genotype)
         print(f"Dropped {dropped_snps} SNP(s)")
-    write_genotype(ciphertext_file, encrypted_genotype)
+    ciphertext_path = Path(genotype_file.name + ".hegp")
+    if ciphertext_path.exists():
+        print(f"Output file {ciphertext_path} exists, cannot overwrite.")
+        sys.exit(1)
+    with ciphertext_path.open("w") as ciphertext_file:
+        write_genotype(ciphertext_file, encrypted_genotype)
 
 @main.command()
 @click.option("--output", "-o", "output_file",
diff --git a/tests/test_pyhegp.py b/tests/test_pyhegp.py
index 61f65cd..0a62ad1 100644
--- a/tests/test_pyhegp.py
+++ b/tests/test_pyhegp.py
@@ -18,6 +18,7 @@
 
 import math
 from pathlib import Path
+import shutil
 
 from click.testing import CliRunner
 from hypothesis import given, settings, strategies as st
@@ -50,11 +51,11 @@ def test_pool_stats(pools):
                                            rel=1e-6))
 
 def test_encrypt(tmp_path):
-    ciphertext = tmp_path / "encrypted-genotype.tsv"
+    shutil.copy("test-data/encrypt-test-genotype.tsv", tmp_path)
+    ciphertext = tmp_path / "encrypt-test-genotype.tsv.hegp"
     result = CliRunner().invoke(main, ["encrypt",
                                        "-s", "test-data/encrypt-test-summary",
-                                       "-o", ciphertext,
-                                       "test-data/encrypt-test-genotype.tsv"])
+                                       str(tmp_path / "encrypt-test-genotype.tsv")])
     assert result.exit_code == 0
     assert ciphertext.exists()
     assert "Dropped 1 SNP(s)" in result.output
@@ -143,20 +144,20 @@ def test_pool(tmp_path):
     assert pooled_summary.n == expected_pooled_summary.n
 
 def test_simple_workflow(tmp_path):
-    ciphertext = tmp_path / "encrypted_genotype.tsv"
+    shutil.copy(f"test-data/genotype.tsv", tmp_path)
+    ciphertext = tmp_path / "genotype.tsv.hegp"
     result = CliRunner().invoke(main,
-                                ["encrypt",
-                                 "-o", ciphertext,
-                                 "test-data/genotype.tsv"])
+                                ["encrypt", str(tmp_path / "genotype.tsv")])
     assert result.exit_code == 0
     assert ciphertext.exists()
 
 def test_joint_workflow(tmp_path):
     runner = CliRunner()
     for i in range(4):
+        shutil.copy(f"test-data/genotype{i}.tsv", tmp_path)
         summary = tmp_path / f"summary{i}"
         result = runner.invoke(
-            main, ["summary", f"test-data/genotype{i}.tsv",
+            main, ["summary", str(tmp_path / f"genotype{i}.tsv"),
                    "-o", summary])
         assert result.exit_code == 0
         assert summary.exists()
@@ -168,18 +169,17 @@ def test_joint_workflow(tmp_path):
     assert result.exit_code == 0
     assert complete_summary.exists()
     for i in range(4):
-        ciphertext = tmp_path / f"encrypted-genotype{i}.tsv"
+        ciphertext = tmp_path / f"genotype{i}.tsv.hegp"
         result = runner.invoke(
             main, ["encrypt",
                    "-s", complete_summary,
-                   "-o", ciphertext,
-                   f"test-data/genotype{i}.tsv"])
+                   str(tmp_path / f"genotype{i}.tsv")])
         assert result.exit_code == 0
         assert ciphertext.exists()
-    complete_ciphertext = tmp_path / "complete-encrypted-genotype.tsv"
+    complete_ciphertext = tmp_path / "complete-genotype.tsv.hegp"
     result = runner.invoke(
         main, ["cat",
                "-o", complete_ciphertext,
-               *(str(tmp_path / f"encrypted-genotype{i}.tsv") for i in range(4))])
+               *(str(tmp_path / f"genotype{i}.tsv.hegp") for i in range(4))])
     assert result.exit_code == 0
     assert complete_ciphertext.exists()