openproblems-bio · dariarom94 · Jun 24, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/scripts/create_resources/spatial/process_10x_atera_nebius.sh b/scripts/create_resources/spatial/process_10x_atera_nebius.sh
@@ -14,7 +14,7 @@ cat > /tmp/params_atera.yaml << HERE
 param_list:
 
   - id: "10x_atera/2026_10x_human_breast_cancer_atera"
-    input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip
+    input:  https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_outs.zip
     dataset_name: "Atera WTA FFPE Human Breast Cancer"
     dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer"
     dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells."

diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
@@ -251,6 +251,9 @@ def subsample_adata_group_balanced(adata, group_key, n_samples, seed=0):
         filter_table=True,
     )
     rechunk_sdata(sdata_output) #NOTE: rechunking currently needed (https://github.com/scverse/spatialdata/issues/929)
+    # The "metadata" table is dataset-level, not spatial; re-add it if the bounding_box query dropped it.
+    if "metadata" in sdata.tables and "metadata" not in sdata_output.tables:
+        sdata_output["metadata"] = sdata.tables["metadata"]
 else:
     sdata_output = sdata
 

diff --git a/src/datasets/workflows/process_tenx_atera/config.vsh.yaml b/src/datasets/workflows/process_tenx_atera/config.vsh.yaml
@@ -82,4 +82,4 @@ dependencies:
 runners:
   - type: nextflow
     directives:
-      label: [highcpu, midmem, hightime]
+      label: [highcpu, highmem, hightime]
diff --git a/src/methods_transcript_assignment/basic_transcript_assignment/script.py b/src/methods_transcript_assignment/basic_transcript_assignment/script.py
@@ -33,7 +33,16 @@
 assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
 
 print('Transforming transcripts coordinates', flush=True)
-transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])
+# Parquet partitions each start from index 0, causing duplicate index values in the
+# combined dask DataFrame. sd.transform() internally builds pd.Series(..., index=transformed.index)
+# which fails with "cannot reindex on an axis with duplicate labels".
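+# Fix: reset the index before transforming, then restore attrs explicitly because reset_index() drops them, which would break spatialdata's PointsModel check.
+# NOTE(review): dask's reset_index() restarts at 0 in every partition instead of building a global monotonic index; confirm the labels are actually unique afterwards.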
+# The original sdata[transcripts_key] is left unchanged so lines below remain consistent.
+transcripts_input = sdata[par['transcripts_key']]
+transcripts_reset = transcripts_input.reset_index(drop=True)
+transcripts_reset.attrs.update(transcripts_input.attrs)
+transcripts = sd.transform(transcripts_reset, to_coordinate_system=par['coordinate_system'])
 
 # In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
 trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()

diff --git a/src/metrics/similarity/config.vsh.yaml b/src/metrics/similarity/config.vsh.yaml
@@ -101,4 +101,4 @@ runners:
   # Allows turning the component into a Nextflow module / pipeline.
   - type: nextflow
     directives:
-      label: [midtime, veryhighmem, midcpu]
+      label: [midtime, highmem, midcpu]