From 6438b08d3a6308b84e3588592e060be0149cba5f Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Tue, 23 Jun 2026 22:53:49 +0200 Subject: [PATCH 1/6] fix processing --- src/data_processors/process_dataset/script.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 5976079d..533a0c7f 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -251,6 +251,9 @@ def subsample_adata_group_balanced(adata, group_key, n_samples, seed=0): filter_table=True, ) rechunk_sdata(sdata_output) #NOTE: rechunking currently needed (https://github.com/scverse/spatialdata/issues/929) + # metadata is dataset-level, not spatial — re-add it if the bounding_box query dropped it + if "metadata" in sdata.tables and "metadata" not in sdata_output.tables: + sdata_output["metadata"] = sdata.tables["metadata"] else: sdata_output = sdata From da648f63c158d8f69383e9ebe9dcc3ecb2a8bed4 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Tue, 23 Jun 2026 22:54:06 +0200 Subject: [PATCH 2/6] adjust link --- scripts/create_resources/spatial/process_10x_atera_nebius.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_resources/spatial/process_10x_atera_nebius.sh b/scripts/create_resources/spatial/process_10x_atera_nebius.sh index baf016a0..d3475e58 100644 --- a/scripts/create_resources/spatial/process_10x_atera_nebius.sh +++ b/scripts/create_resources/spatial/process_10x_atera_nebius.sh @@ -14,7 +14,7 @@ cat > /tmp/params_atera.yaml << HERE param_list: - id: "10x_atera/2026_10x_human_breast_cancer_atera" - input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip + input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_outs.zip dataset_name: "Atera WTA FFPE Human 
Breast Cancer" dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer" dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells." From a7ddc04223512c5f41d75bf1e07300c2d6c8f1de Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Tue, 23 Jun 2026 22:54:32 +0200 Subject: [PATCH 3/6] adjust memory --- src/datasets/workflows/process_tenx_atera/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/workflows/process_tenx_atera/config.vsh.yaml b/src/datasets/workflows/process_tenx_atera/config.vsh.yaml index 5b43ab44..58148a02 100644 --- a/src/datasets/workflows/process_tenx_atera/config.vsh.yaml +++ b/src/datasets/workflows/process_tenx_atera/config.vsh.yaml @@ -82,4 +82,4 @@ dependencies: runners: - type: nextflow directives: - label: [highcpu, midmem, hightime] \ No newline at end of file + label: [highcpu, highmem, hightime] \ No newline at end of file From f18a199c6f81615a56eedb6f28a40192f0ec04ac Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Wed, 24 Jun 2026 15:10:26 +0200 Subject: [PATCH 4/6] basic TA bug fix --- .../basic_transcript_assignment/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/methods_transcript_assignment/basic_transcript_assignment/script.py b/src/methods_transcript_assignment/basic_transcript_assignment/script.py index 0850c0f9..0e3347c0 100644 --- a/src/methods_transcript_assignment/basic_transcript_assignment/script.py +++ b/src/methods_transcript_assignment/basic_transcript_assignment/script.py @@ -33,6 +33,8 @@ assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." 
print('Transforming transcripts coordinates', flush=True) +# reset_index avoids materializing the parquet-backed dask index inside sd.transform +sdata[par['transcripts_key']] = sdata[par['transcripts_key']].reset_index(drop=True) transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system']) # In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates From 008d11b734cb72ff006db74d8c488bb662880461 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Wed, 24 Jun 2026 15:10:48 +0200 Subject: [PATCH 5/6] adjust mem --- src/metrics/similarity/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metrics/similarity/config.vsh.yaml b/src/metrics/similarity/config.vsh.yaml index 79faa93b..7a8dde9b 100644 --- a/src/metrics/similarity/config.vsh.yaml +++ b/src/metrics/similarity/config.vsh.yaml @@ -101,4 +101,4 @@ runners: # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: - label: [midtime, veryhighmem, midcpu] + label: [midtime, highmem, midcpu] From 364f03e90886aa066e4f6180dacfb7d609504923 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Wed, 24 Jun 2026 15:38:34 +0200 Subject: [PATCH 6/6] fix .attrs issue in spatialdata --- .../basic_transcript_assignment/script.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/methods_transcript_assignment/basic_transcript_assignment/script.py b/src/methods_transcript_assignment/basic_transcript_assignment/script.py index 0e3347c0..e28fed31 100644 --- a/src/methods_transcript_assignment/basic_transcript_assignment/script.py +++ b/src/methods_transcript_assignment/basic_transcript_assignment/script.py @@ -33,9 +33,16 @@ assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." 
print('Transforming transcripts coordinates', flush=True) -# reset_index avoids materializing the parquet-backed dask index inside sd.transform -sdata[par['transcripts_key']] = sdata[par['transcripts_key']].reset_index(drop=True) -transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system']) +# Parquet partitions each start from index 0, causing duplicate index values in the +# combined dask DataFrame. sd.transform() internally builds pd.Series(..., index=transformed.index) +# which fails with "cannot reindex on an axis with duplicate labels". +# Fix: reset the index before transforming (NOTE: dask's reset_index restarts at 0 in each partition, so the result is not globally monotonic - TODO confirm why this avoids the error); restore attrs explicitly +# because reset_index() drops them, which would break spatialdata's PointsModel check. +# The original sdata[transcripts_key] is left unchanged so lines below remain consistent. +transcripts_input = sdata[par['transcripts_key']] +transcripts_reset = transcripts_input.reset_index(drop=True) +transcripts_reset.attrs.update(transcripts_input.attrs) +transcripts = sd.transform(transcripts_reset, to_coordinate_system=par['coordinate_system']) # In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()