diff options
-rw-r--r-- | README.md | 10 | ||||
-rw-r--r-- | pyhegp/pyhegp.py | 14 | ||||
-rw-r--r-- | tests/test_pyhegp.py | 26 |
3 files changed, 27 insertions, 23 deletions
diff --git a/README.md b/README.md index 81c153c..ee7057f 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,9 @@ pyhegp --help In this simple scenario, there is only one data owner and they wish to share their encrypted data with a researcher. The data owner encrypts their data with: ``` -pyhegp encrypt -o encrypted-genotype.tsv genotype.tsv +pyhegp encrypt genotype.tsv ``` -They then send the encrypted data to the researcher. Note that data sharing is carried out-of-band and is outside the scope of `pyhegp`. +They then send the encrypted data `genotype.tsv.hegp` to the researcher. Note that data sharing is carried out-of-band and is outside the scope of `pyhegp`. ## Joint/federated analysis with many data owners @@ -83,11 +83,11 @@ pyhegp pool -o complete-summary summary1 summary2 ... ``` The data broker shares these summary statistics with the data owners. The data owners standardize their data using these summary statistics, and encrypt their data using a random key. ``` -pyhegp encrypt -s complete-summary -o encrypted-genotype.tsv genotype.tsv +pyhegp encrypt -s complete-summary genotype.tsv ``` -Finally, the data owners share the encrypted data with the broker who concatenates it and shares it with all parties. +Finally, the data owners share the encrypted data `genotype.tsv.hegp` with the broker who concatenates it and shares it with all parties. ``` -pyhegp cat -o complete-encrypted-genotype.tsv encrypted-genotype1.tsv encrypted-genotype2.tsv ... +pyhegp cat -o complete-genotype.tsv.hegp genotype1.tsv.hegp genotype2.tsv.hegp ... ``` Note that all data sharing is carried out-of-band and is outside the scope of `pyhegp`. diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py index 650b1f8..4db76ea 100644 --- a/pyhegp/pyhegp.py +++ b/pyhegp/pyhegp.py @@ -18,6 +18,8 @@ from collections import namedtuple from functools import reduce +from pathlib import Path +import sys import click import numpy as np @@ -143,10 +145,7 @@ def pool(pooled_summary_file, summary_files): help="Summary statistics file") @click.option("--key", "-k", "key_file", type=click.File("w"), help="Output key") -@click.option("--output", "-o", "ciphertext_file", type=click.File("w"), - default="-", - help="Output ciphertext") -def encrypt(genotype_file, summary_file, key_file, ciphertext_file): +def encrypt(genotype_file, summary_file, key_file): genotype = read_genotype(genotype_file) if summary_file: summary = read_summary(summary_file) @@ -162,7 +161,12 @@ def encrypt(genotype_file, summary_file, key_file, ciphertext_file): if len(encrypted_genotype) < len(genotype): dropped_snps = len(genotype) - len(encrypted_genotype) print(f"Dropped {dropped_snps} SNP(s)") - write_genotype(ciphertext_file, encrypted_genotype) + ciphertext_path = Path(genotype_file.name + ".hegp") + if ciphertext_path.exists(): + print(f"Output file {ciphertext_path} exists, cannot overwrite.") + sys.exit(1) + with ciphertext_path.open("w") as ciphertext_file: + write_genotype(ciphertext_file, encrypted_genotype) @main.command() @click.option("--output", "-o", "output_file", diff --git a/tests/test_pyhegp.py b/tests/test_pyhegp.py index 61f65cd..0a62ad1 100644 --- a/tests/test_pyhegp.py +++ b/tests/test_pyhegp.py @@ -18,6 +18,7 @@ import math from pathlib import Path +import shutil from click.testing import CliRunner from hypothesis import given, settings, strategies as st @@ -50,11 +51,11 @@ def test_pool_stats(pools): rel=1e-6)) def test_encrypt(tmp_path): - ciphertext = tmp_path / "encrypted-genotype.tsv" + shutil.copy("test-data/encrypt-test-genotype.tsv", tmp_path) + ciphertext = tmp_path / "encrypt-test-genotype.tsv.hegp" result = CliRunner().invoke(main, ["encrypt", "-s", "test-data/encrypt-test-summary", - "-o", ciphertext, - "test-data/encrypt-test-genotype.tsv"]) + str(tmp_path / "encrypt-test-genotype.tsv")]) assert result.exit_code == 0 assert ciphertext.exists() assert "Dropped 1 SNP(s)" in result.output @@ -143,20 +144,20 @@ def test_pool(tmp_path): assert pooled_summary.n == expected_pooled_summary.n def test_simple_workflow(tmp_path): - ciphertext = tmp_path / "encrypted_genotype.tsv" + shutil.copy(f"test-data/genotype.tsv", tmp_path) + ciphertext = tmp_path / "genotype.tsv.hegp" result = CliRunner().invoke(main, - ["encrypt", - "-o", ciphertext, - "test-data/genotype.tsv"]) + ["encrypt", str(tmp_path / "genotype.tsv")]) assert result.exit_code == 0 assert ciphertext.exists() def test_joint_workflow(tmp_path): runner = CliRunner() for i in range(4): + shutil.copy(f"test-data/genotype{i}.tsv", tmp_path) summary = tmp_path / f"summary{i}" result = runner.invoke( - main, ["summary", f"test-data/genotype{i}.tsv", + main, ["summary", str(tmp_path / f"genotype{i}.tsv"), "-o", summary]) assert result.exit_code == 0 assert summary.exists() @@ -168,18 +169,17 @@ def test_joint_workflow(tmp_path): assert result.exit_code == 0 assert complete_summary.exists() for i in range(4): - ciphertext = tmp_path / f"encrypted-genotype{i}.tsv" + ciphertext = tmp_path / f"genotype{i}.tsv.hegp" result = runner.invoke( main, ["encrypt", "-s", complete_summary, - "-o", ciphertext, - f"test-data/genotype{i}.tsv"]) + str(tmp_path / f"genotype{i}.tsv")]) assert result.exit_code == 0 assert ciphertext.exists() - complete_ciphertext = tmp_path / "complete-encrypted-genotype.tsv" + complete_ciphertext = tmp_path / "complete-genotype.tsv.hegp" result = runner.invoke( main, ["cat", "-o", complete_ciphertext, - *(str(tmp_path / f"encrypted-genotype{i}.tsv") for i in range(4))]) + *(str(tmp_path / f"genotype{i}.tsv.hegp") for i in range(4))]) assert result.exit_code == 0 assert complete_ciphertext.exists() |