From 4224fef9ddcba7cd5dc7fbd29fde907a23df15f5 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 15:24:56 +0000 Subject: [PATCH 1/6] Switch to real hashing --- .gitignore | 3 +++ .pre-commit-config.yaml | 3 ++- README.md | 22 +++++++++++++++++ config.EXAMPLE/.env.EXAMPLE | 5 ++++ config.EXAMPLE/exporter.env.EXAMPLE | 6 +++++ config.EXAMPLE/hasher.env.EXAMPLE | 12 ++++++---- docker-compose.yml | 2 +- pyproject.toml | 2 +- src/pseudon/hashing.py | 37 ++++++++++++++++++++++------- src/settings.py | 3 +++ 10 files changed, 79 insertions(+), 16 deletions(-) create mode 100644 config.EXAMPLE/.env.EXAMPLE diff --git a/.gitignore b/.gitignore index be3f5ce..a44b7dd 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ wheels/ # settings files (should not be in the source tree anyway, but just in case) *.env + +# snakemake tracking files +.snakemake diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4b0fd9..a217d81 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,8 @@ repos: [ "pandas-stubs", "types-psycopg2", - "types-pika" + "types-pika", + "types-requests", ] files: src/ # a collection of sanity checks: check for merge conflicts, check the end of diff --git a/README.md b/README.md index 4743482..77dc783 100644 --- a/README.md +++ b/README.md @@ -48,18 +48,38 @@ separate to the Emap project root. ### Instructions for achieving this structure + +#### Clone repos Clone this repo (`waveform-controller`) and [PIXL](https://github.com/SAFEHR-data/PIXL), both inside your root directory. 
+#### make config files Set up the config files as follows: ``` mkdir config cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env +cp waveform-controller/config.EXAMPLE/.env.EXAMPLE waveform-controller/.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. +#### fill in config files +Fill out the config, as appropriate. + +Tip: HASHER_API_LISTEN_PORT should match HASHER_API_PORT if you are running your own instance of the +PIXL hasher (as things stand, we are doing so). + +When updating to a new version of this code, you should diff the .EXAMPLE file against its live version, +eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`. + +This checks if any config options have been added/removed from the .EXAMPLE, and thus should be +added/removed from the live file. + +> [!CAUTION] +> Be careful not to copy sensitive data from the live config file to the .EXAMPLE file! + +#### make necessary directories If it doesn't already exist you should create a directory named `waveform-export` in the parent directory to store the saved waveform messages. @@ -68,6 +88,8 @@ messages. mkdir waveform-export ``` +#### run it! + Build and start the controller and exporter with docker ``` cd waveform-controller diff --git a/config.EXAMPLE/.env.EXAMPLE b/config.EXAMPLE/.env.EXAMPLE new file mode 100644 index 0000000..bd473cc --- /dev/null +++ b/config.EXAMPLE/.env.EXAMPLE @@ -0,0 +1,5 @@ +# This is an EXAMPLE file, do not put real secrets in here. +# Copy it to .env and then DELETE THIS COMMENT. +# (This is a bit different from the others, which live in config. 
+# This one is just for the variables in the docker compose file) +HASHER_API_LISTEN_PORT= diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE index 7be714c..14a601d 100644 --- a/config.EXAMPLE/exporter.env.EXAMPLE +++ b/config.EXAMPLE/exporter.env.EXAMPLE @@ -2,7 +2,13 @@ # Copy it to ../config/exporter.env and then DELETE THIS COMMENT. # When does the exporter run EXPORTER_CRON_SCHEDULE="14 5 * * *" +# Where to upload via FTPS FTPS_HOST=myftps.example.com FTPS_PORT=990 FTPS_USERNAME= FTPS_PASSWORD= +# only run workflow up to and including the specified rule +SNAKEMAKE_RULE_UNTIL= +# point to the hasher we wish to use +HASHER_API_HOSTNAME=waveform-hasher +HASHER_API_PORT= diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE index 1cd6ad0..ad79b7a 100644 --- a/config.EXAMPLE/hasher.env.EXAMPLE +++ b/config.EXAMPLE/hasher.env.EXAMPLE @@ -1,6 +1,10 @@ # This is an EXAMPLE file, do not put real secrets in here. # Copy it to ../config/hasher.env and then DELETE THIS COMMENT. 
-HASHER_API_AZ_CLIENT_ID= -HASHER_API_AZ_CLIENT_PASSWORD= -HASHER_API_AZ_TENANT_ID= -HASHER_API_AZ_KEY_VAULT_NAME= +AZURE_CLIENT_ID= +AZURE_CLIENT_SECRET= +AZURE_TENANT_ID= +AZURE_KEY_VAULT_NAME= +# This is the "variable name" of the actual secret, +# and can be fixed and is not secret itself +AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" + diff --git a/docker-compose.yml b/docker-compose.yml index d20153a..56ecbb8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,7 +47,7 @@ services: HTTPS_PROXY: ${HTTPS_PROXY} https_proxy: ${https_proxy} ports: - - "127.0.0.1:${HASHER_API_PORT}:8000" + - "127.0.0.1:${HASHER_API_LISTEN_PORT}:8000" env_file: - ../config/hasher.env restart: unless-stopped diff --git a/pyproject.toml b/pyproject.toml index 7ee53db..bb0d238 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ dependencies = [ "snakemake==9.14.5", # need to be compatible with PIXL, which currently pins 2.9.10 (arguably it shouldn't) "psycopg2-binary>=2.9.10", - "stablehash==0.3.0", + "requests==2.32.3", # trick for making a "relative" path, works inside or outside container image "core @ file:///${PROJECT_ROOT}/../PIXL/pixl_core", ] diff --git a/src/pseudon/hashing.py b/src/pseudon/hashing.py index d7ac7b9..35bde98 100644 --- a/src/pseudon/hashing.py +++ b/src/pseudon/hashing.py @@ -1,17 +1,36 @@ +import logging from functools import lru_cache -from stablehash import stablehash +import requests -@lru_cache +import settings + +logger = logging.getLogger(__name__) + + +@lru_cache(maxsize=1000) def do_hash(type_prefix: str, value: str): - """Stub implementation of deidentification function for testing purposes. + """Pass data to the hasher API for de-identification purposes. Not that I think this will happen in practice, but we'd want the CSN "1234" to hash to a different value than the MRN "1234", so prefix each value with its type. """ - # Full implementation of issue #6 must remove this code and call the real hasher!! 
- SALT = "waveform-exporter" - full_value_to_hash = f"{SALT}:{type_prefix}:{value}" - full_hash = stablehash(full_value_to_hash).hexdigest() - tiny_hash = full_hash[:8] - return tiny_hash + + project_slug = "waveform-exporter" + full_value_to_hash = f"{type_prefix}:{value}" + + hasher_hostname = settings.HASHER_API_HOSTNAME + hasher_port = settings.HASHER_API_PORT + hasher_req_url = f"http://{hasher_hostname}:{hasher_port}/hash" + request_params: dict[str, str | int] = { + "project_slug": project_slug, + "message": full_value_to_hash, + } + # do we need to specify a particular hash length? + # request_params["length"] = hash_len + + response = requests.get(hasher_req_url, params=request_params) + logger.debug("RESPONSE = {}", response.text) + response.raise_for_status() + real_hash = response.text + return real_hash diff --git a/src/settings.py b/src/settings.py index dfbebc1..8aaed85 100644 --- a/src/settings.py +++ b/src/settings.py @@ -29,3 +29,6 @@ def get_from_env(env_var, *, default_value=None, setting_name=None): get_from_env("FTPS_PORT", default_value=990) get_from_env("FTPS_USERNAME") get_from_env("FTPS_PASSWORD") + +get_from_env("HASHER_API_HOSTNAME") +get_from_env("HASHER_API_PORT") From a1e63ab37256369da1708ac53860c7e34af46282 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 16:56:07 +0000 Subject: [PATCH 2/6] We don't actually need to listen on a specified host port because we access directly by service name on the docker network. Just let it pick an ephemeral one. 
--- README.md | 4 ---- config.EXAMPLE/.env.EXAMPLE | 5 ----- docker-compose.yml | 3 ++- 3 files changed, 2 insertions(+), 10 deletions(-) delete mode 100644 config.EXAMPLE/.env.EXAMPLE diff --git a/README.md b/README.md index 77dc783..97d29d7 100644 --- a/README.md +++ b/README.md @@ -60,16 +60,12 @@ mkdir config cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env -cp waveform-controller/config.EXAMPLE/.env.EXAMPLE waveform-controller/.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. #### fill in config files Fill out the config, as appropriate. -Tip: HASHER_API_LISTEN_PORT should match HASHER_API_PORT if you are running your own instance of the -PIXL hasher (as things stand, we are doing so). - When updating to a new version of this code, you should diff the .EXAMPLE file against its live version, eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`. diff --git a/config.EXAMPLE/.env.EXAMPLE b/config.EXAMPLE/.env.EXAMPLE deleted file mode 100644 index bd473cc..0000000 --- a/config.EXAMPLE/.env.EXAMPLE +++ /dev/null @@ -1,5 +0,0 @@ -# This is an EXAMPLE file, do not put real secrets in here. -# Copy it to .env and then DELETE THIS COMMENT. -# (This is a bit different from the others, which live in config. 
-# This one is just for the variables in the docker compose file) -HASHER_API_LISTEN_PORT= diff --git a/docker-compose.yml b/docker-compose.yml index 56ecbb8..6c30ca7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,7 +47,8 @@ services: HTTPS_PROXY: ${HTTPS_PROXY} https_proxy: ${https_proxy} ports: - - "127.0.0.1:${HASHER_API_LISTEN_PORT}:8000" + # this is only here as a convenience for testing, we don't actually use it from the exporter + - "127.0.0.1::8000" env_file: - ../config/hasher.env restart: unless-stopped From 7da0c5f208952667f5fc9f64599794ce2a2510a0 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 17:00:18 +0000 Subject: [PATCH 3/6] Specify hasher port as default --- config.EXAMPLE/exporter.env.EXAMPLE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE index 14a601d..134ac38 100644 --- a/config.EXAMPLE/exporter.env.EXAMPLE +++ b/config.EXAMPLE/exporter.env.EXAMPLE @@ -11,4 +11,4 @@ FTPS_PASSWORD= SNAKEMAKE_RULE_UNTIL= # point to the hasher we wish to use HASHER_API_HOSTNAME=waveform-hasher -HASHER_API_PORT= +HASHER_API_PORT=8000 From e6aaf337e9ed4a7121e8e7da6cef2fc1b3dd669e Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 23 Jan 2026 17:20:03 +0000 Subject: [PATCH 4/6] Fix linting --- config.EXAMPLE/hasher.env.EXAMPLE | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE index ad79b7a..0631345 100644 --- a/config.EXAMPLE/hasher.env.EXAMPLE +++ b/config.EXAMPLE/hasher.env.EXAMPLE @@ -6,5 +6,4 @@ AZURE_TENANT_ID= AZURE_KEY_VAULT_NAME= # This is the "variable name" of the actual secret, # and can be fixed and is not secret itself -AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" - +AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret From 8f3c455abdb9322ece197e597ab6cb1b97b665bc Mon Sep 17 00:00:00 2001 From: Jeremy Stein 
Date: Mon, 26 Jan 2026 16:24:39 +0000 Subject: [PATCH 5/6] Doc tweak --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 97d29d7..0afc24a 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Set up the config files as follows: ``` mkdir config cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env -cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env +cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/exporter.env cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. From eb2c16a75d0610da63f3f4b7d5081da71d62f489 Mon Sep 17 00:00:00 2001 From: Jeremy Stein Date: Fri, 30 Jan 2026 15:21:23 +0000 Subject: [PATCH 6/6] Document how to config the hasher --- README.md | 6 ++-- config.EXAMPLE/hasher.env.EXAMPLE | 6 ++++ docs/azure_hashing.md | 48 +++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 docs/azure_hashing.md diff --git a/README.md b/README.md index 0afc24a..a7df500 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,11 @@ cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. -#### fill in config files +#### Fill in config files Fill out the config, as appropriate. +See [azure and hasher setup](docs/azure_hashing.md) to configure the hasher. + When updating to a new version of this code, you should diff the .EXAMPLE file against its live version, eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`. @@ -86,7 +88,7 @@ mkdir waveform-export #### run it! -Build and start the controller and exporter with docker +Build and start the hasher, controller and exporter with docker. 
``` cd waveform-controller docker compose build diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE index 0631345..efeae85 100644 --- a/config.EXAMPLE/hasher.env.EXAMPLE +++ b/config.EXAMPLE/hasher.env.EXAMPLE @@ -1,9 +1,15 @@ # This is an EXAMPLE file, do not put real secrets in here. # Copy it to ../config/hasher.env and then DELETE THIS COMMENT. +# Details for the Azure service principal, so it can log in to the keyvault. +# aka "appId" AZURE_CLIENT_ID= +# aka "password" AZURE_CLIENT_SECRET= +# aka "tenant" AZURE_TENANT_ID= +# the name of the key vault, NOT the service principal AZURE_KEY_VAULT_NAME= + # This is the "variable name" of the actual secret, # and can be fixed and is not secret itself AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md new file mode 100644 index 0000000..43898fb --- /dev/null +++ b/docs/azure_hashing.md @@ -0,0 +1,48 @@ +# Setting up Azure + Hashing in Dev and Production + +# Create and configure Azure key vaults + +Azure key vaults for dev and prod already exist. Ask your team-mates how to find the details for these. + +For each, there is an Azure service principal (aka. machine account) that can read/write secrets to +the key vault. + +# Configure hasher +The hasher needs to be given the service principal details so it can create/obtain the +secrets. It also needs to know the name of the manually-created secret (see next section for more details). + +See [hasher example config](../config.EXAMPLE/hasher.env.EXAMPLE) for detailed description of required env vars. + +# Manual Azure config + +There is a one-off (per key vault) step that needs to be performed manually. + +First, install the Azure CLI tools in the usual way for your OS. + +Log in using the service principal. +Do not include password on command line; let it prompt you and then paste it in. 
+``` +az login --service-principal --username <client-id> --tenant <tenant-id> +``` + +Now you can run commands to inspect the existing setup: +``` +# show all keyvaults +az keyvault list + +# Show keyvault details (not secrets). name is "name" key from previous command +az keyvault show --name <vault-name> + +# list all secrets in keyvault +az keyvault secret list --vault-name <vault-name> +``` +As per [PIXL instructions](https://github.com/SAFEHR-data/PIXL/blob/main/docs/setup/azure-keyvault.md#step-4), +you need to manually create a secret project-level key: +``` +az keyvault secret set --vault-name <vault-name> --name <secret-name> --value <secret-value> +``` +Note that you can choose the name of this secret (`<secret-name>` above), and its name (NOT its value) +should be placed in the config env var `AZURE_KEY_VAULT_SECRET_NAME`. + +In addition, the PIXL hasher automatically creates a secret named after the "project slug" that you pass +in, the first time that you request a hash using that project slug.