isamplesorg · rdhyee · Jun 14, 2026 · Jun 14, 2026
diff --git a/Makefile b/Makefile
@@ -31,7 +31,7 @@ VALIDATE := scripts/validate_frontend_derived.py
 ENRICH  := scripts/enrich_wide_with_oc_concepts.py
 VALIDATE_ENRICH := scripts/validate_oc_concept_enrichment.py
 
-.PHONY: help test wide oc-wide enrich validate-enrich derived validate all all-272 clean
+.PHONY: help test wide oc-wide enrich validate-enrich derived validate all all-272 ingest-272 all-202608 clean
 help:
 	@grep -E '^#   make' Makefile | sed 's/^#   /  /'
 
@@ -81,5 +81,30 @@ all-272: validate-enrich
 	$(MAKE) derived DERIVED_WIDE=$(ENRICHED) TAG=$(TAG)
 	$(MAKE) validate TAG=$(TAG) SENTINEL_FLAG=
 
+# TRUE SYNC ingestion: add 67,187 new OC pids + remove 21,227 stale OC pids (#272 Phase 2, D3).
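+# Requires the 202606 wide (SRC_202606, passed as --src) and Eric's OC wide (OC_WIDE_2026, passed as --oc-wide).
+# ingest-272 writes only the 202608-tagged wide parquet ($(INGEST_OUT)); all-202608 also builds + validates the derived files in $(OUTDIR).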
+#
+#   make ingest-272 \
+#     SRC_202606=~/Data/iSample/pqg_refining/isamples_202606_wide.parquet \
+#     OC_WIDE_2026=~/Data/iSample/pqg_refining/oc_isamples_pqg_wide_2026-06-09.parquet
+#
+INGEST_TAG ?= isamples_202608
+SRC_202606 ?= $(OUTDIR)/isamples_202606_wide.parquet
+OC_WIDE_2026 ?= $(OUTDIR)/oc_isamples_pqg_wide_2026-06-09.parquet
+INGEST_OUT ?= $(OUTDIR)/$(INGEST_TAG)_wide.parquet
+INGEST := scripts/ingest_oc_records.py
+
+$(INGEST_OUT): $(SRC_202606) $(OC_WIDE_2026)
+	@mkdir -p $(@D)
+	$(PY) $(INGEST) --src $(SRC_202606) --oc-wide $(OC_WIDE_2026) --out $@
+
+ingest-272: $(INGEST_OUT)
+
+# Full 202608 pipeline: ingest -> derived -> validate
+all-202608: ingest-272
+	$(MAKE) derived DERIVED_WIDE=$(INGEST_OUT) TAG=$(INGEST_TAG)
+	$(MAKE) validate TAG=$(INGEST_TAG) SENTINEL_FLAG=
+
 clean:
 	rm -rf $(OUTDIR)
diff --git a/explorer.qmd b/explorer.qmd
@@ -12,9 +12,9 @@ format:
     include-in-header:
       text: |
         <link rel="preconnect" href="https://data.isamples.org" crossorigin>
-        <link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202606_h3_summary_res4.parquet">
-        <link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202606_facet_summaries.parquet">
-        <link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/vocab_labels.parquet">
+        <link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202608_h3_summary_res4.parquet">
+        <link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202608_facet_summaries.parquet">
+        <link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/vocab_labels_202608.parquet">
 ---
 
 ```{=html}
@@ -749,23 +749,23 @@ R2_BASE = (() => {
     // default and absolute overrides (http://localhost:8099/data) pass through.
     return raw.startsWith('/') ? new URL(raw, location.origin).href : raw;
 })()
-h3_res4_url = `${R2_BASE}/isamples_202606_h3_summary_res4.parquet`
-h3_res6_url = `${R2_BASE}/isamples_202606_h3_summary_res6.parquet`
-h3_res8_url = `${R2_BASE}/isamples_202606_h3_summary_res8.parquet`
-lite_url = `${R2_BASE}/isamples_202606_samples_map_lite.parquet`
+h3_res4_url = `${R2_BASE}/isamples_202608_h3_summary_res4.parquet`
+h3_res6_url = `${R2_BASE}/isamples_202608_h3_summary_res6.parquet`
+h3_res8_url = `${R2_BASE}/isamples_202608_h3_summary_res8.parquet`
+lite_url = `${R2_BASE}/isamples_202608_samples_map_lite.parquet`
 // Explicit versioned wide (#272: OC concept-enriched — popups read material/
 // object-type from this file). The stable alias `current/wide.parquet` still
 // points at the previous wide until the production cutover flips the manifest;
 // pinning the version here keeps staging and prod each self-consistent.
-wide_url = `${R2_BASE}/isamples_202606_wide.parquet`
+wide_url = `${R2_BASE}/isamples_202608_wide.parquet`
 // v2 carries object_type alongside material and context (URI-string columns).
-facets_url = `${R2_BASE}/isamples_202606_sample_facets_v2.parquet`
-facet_summaries_url = `${R2_BASE}/isamples_202606_facet_summaries.parquet`
+facets_url = `${R2_BASE}/isamples_202608_sample_facets_v2.parquet` // builder emits {tag}_sample_facets_v2.parquet
+facet_summaries_url = `${R2_BASE}/isamples_202608_facet_summaries.parquet`
 // Pre-aggregated single-filter cache for fast cross-filtered facet counts.
-cross_filter_url = `${R2_BASE}/isamples_202606_facet_cross_filter.parquet`
+cross_filter_url = `${R2_BASE}/isamples_202608_facet_cross_filter.parquet`
 // SKOS prefLabels for Material / Sampled Feature / Specimen Type URIs.
 // ~60 KB lookup; falls back to URI tail if a URI isn't covered.
-vocab_labels_url = `${R2_BASE}/vocab_labels.parquet`
+vocab_labels_url = `${R2_BASE}/vocab_labels_202608.parquet`
 
 // Canonical palette — see issue #113. Path-relative so this works under
 // both isamples.org (custom domain at root) and project-pages fork

diff --git a/scripts/build_frontend_derived.py b/scripts/build_frontend_derived.py
@@ -13,7 +13,7 @@
   live in `p__has_{material,context,sample_object}_category` row-id arrays.
 
 OUTPUTS (into --outdir, prefixed --tag):
-  - {tag}_sample_facets_v2.parquet   pid, source, material, context, object_type, label, description, place_name(VARCHAR)
+  - {tag}_sample_facets_v2.parquet   pid, source, material, context, object_type, label, description (search-only; includes appended concept labels), place_name(VARCHAR)
   - {tag}_samples_map_lite.parquet   pid, label, source, latitude, longitude, place_name(VARCHAR[]), result_time, h3_res8(UBIGINT), h3_res8_hex
   - {tag}_h3_summary_res{4,6,8}.parquet  h3_cell(UBIGINT), sample_count(INT), center_lat, center_lng, dominant_source, source_count(INT), resolution(INT)
   - {tag}_facet_summaries.parquet    facet_type, facet_value, scheme, count
@@ -48,6 +48,21 @@
 ARTIFACTS = ["sample_facets_v2", "samples_map_lite", "h3_summaries",
              "facet_summaries", "facet_cross_filter", "wide_h3"]
 
+# Shared SQL expression for sample_facets_v2.description (#277 part 2).
+# Appends space-joined concept labels (IC labels across all 4 concept dims)
+# to the raw description so full-text search matches concept terms even when
+# they don't appear in label/description/place_name.  description is
+# SEARCH-ONLY in facets_v2 — display reads from the wide parquet.
+# Used by build_sample_facets_v2 AND the validator's --wide semantic gate so
+# they can never drift from each other.
+FACETS_DESCRIPTION_EXPR = (
+    "CASE"
+    "  WHEN concept_labels IS NOT NULL AND TRIM(concept_labels) != ''"
+    "  THEN COALESCE(description, '') || ' ' || concept_labels"
+    "  ELSE description"
+    " END"
+)
+
 
 def log(msg, t0):
     print(f"[{time.time()-t0:6.1f}s] {msg}", flush=True)
@@ -77,8 +92,12 @@ def geometry_expr(con, wide):
 def build_base_tables(con, wide, t0):
     geom = geometry_expr(con, wide)
     con.execute(f"""
+    -- ic: concept lookup for facet resolution and label aggregation.
+    -- label is included so concept_labels can aggregate human-readable text
+    -- directly from the wide without a second scan.
     CREATE OR REPLACE TEMP TABLE ic AS
-      SELECT row_id, pid AS uri FROM read_parquet('{wide}') WHERE otype='IdentifiedConcept';
+      SELECT row_id, pid AS uri, label
+      FROM read_parquet('{wide}') WHERE otype='IdentifiedConcept';
 
     -- material: FIRST NON-ROOT concept per sample. Decorrelated (unnest+join+
     -- arg_min by array ordinality) — NOT a correlated subquery and NOT a MAP
@@ -95,6 +114,39 @@ def build_base_tables(con, wide, t0):
       WHERE ic.uri <> '{MATERIAL_ROOT}'
       GROUP BY ex.pid;
 
+    -- concept_labels: one row per MSR pid; concept_labels is a space-joined
+    -- string of all DISTINCT non-null IC labels referenced across
+    -- p__has_material_category, p__has_sample_object_type,
+    -- p__has_context_category, and p__keywords.  Appended (search-only) into
+    -- sample_facets_v2.description so full-text searches like "pottery cyprus"
+    -- match samples tagged with a pottery concept even if the word doesn't
+    -- appear in their label/description/place_name.  facets_v2.description is
+    -- SEARCH-ONLY; display always reads description from the wide parquet.
+    CREATE OR REPLACE TEMP TABLE concept_labels AS
+      WITH all_refs AS (
+        SELECT s.pid, u.rid
+        FROM read_parquet('{wide}') s, UNNEST(s.p__has_material_category) AS u(rid)
+        WHERE s.otype='MaterialSampleRecord'
+        UNION ALL
+        SELECT s.pid, u.rid
+        FROM read_parquet('{wide}') s, UNNEST(s.p__has_sample_object_type) AS u(rid)
+        WHERE s.otype='MaterialSampleRecord'
+        UNION ALL
+        SELECT s.pid, u.rid
+        FROM read_parquet('{wide}') s, UNNEST(s.p__has_context_category) AS u(rid)
+        WHERE s.otype='MaterialSampleRecord'
+        UNION ALL
+        SELECT s.pid, u.rid
+        FROM read_parquet('{wide}') s, UNNEST(s.p__keywords) AS u(rid)
+        WHERE s.otype='MaterialSampleRecord'
+      )
+      SELECT r.pid,
+             string_agg(DISTINCT ic.label, ' ' ORDER BY ic.label) AS concept_labels
+      FROM all_refs r
+      JOIN ic ON ic.row_id = r.rid
+      WHERE ic.label IS NOT NULL AND TRIM(ic.label) != ''
+      GROUP BY r.pid;
+
     -- one row per MaterialSampleRecord; all concept resolution via JOINs (decorrelated).
     CREATE OR REPLACE TEMP TABLE samp AS
       SELECT
@@ -108,11 +160,13 @@ def build_base_tables(con, wide, t0):
         ROUND(ST_X({geom}), 6)           AS longitude,
         mat.material                     AS material,
         ctx.uri                          AS context,
-        obj.uri                          AS object_type
+        obj.uri                          AS object_type,
+        cl.concept_labels                AS concept_labels
       FROM read_parquet('{wide}') s
       LEFT JOIN mat ON mat.pid = s.pid
       LEFT JOIN ic AS ctx ON ctx.row_id = s.p__has_context_category[1]
       LEFT JOIN ic AS obj ON obj.row_id = s.p__has_sample_object_type[1]
+      LEFT JOIN concept_labels cl ON cl.pid = s.pid
       WHERE s.otype='MaterialSampleRecord';
 
     CREATE OR REPLACE TEMP TABLE samp_geo AS
@@ -136,8 +190,19 @@ def build_base_tables(con, wide, t0):
 
 
 def build_sample_facets_v2(con, out):
+    # description is SEARCH-ONLY in sample_facets_v2: the explorer reads
+    # description for display from the wide parquet (self-join on pid), never
+    # from facets_v2.  We append the space-joined concept labels of every
+    # IdentifiedConcept referenced by this sample (p__has_material_category,
+    # p__has_sample_object_type, p__has_context_category, p__keywords) so that
+    # full-text searches like "pottery cyprus" match samples tagged with a pottery
+    # concept even when the word doesn't appear in label/description/place_name.
+    # The wide's IdentifiedConcept.label is used directly (covers minted keyword
+    # concepts such as British Museum thesaurus terms that are absent from
+    # vocab_labels.parquet). See issue #277 part 2.
     con.execute(f"""COPY (
-        SELECT pid, source, material, context, object_type, label, description,
+        SELECT pid, source, material, context, object_type, label,
+               {FACETS_DESCRIPTION_EXPR} AS description,
                place_name::VARCHAR AS place_name
         FROM samp_geo ORDER BY pid
     ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")
@@ -185,7 +250,7 @@ def build_h3_summary(con, out, res):
 
 def build_facet_summaries(con, out):
     union = " UNION ALL ".join(
-        f"SELECT '{d}' AS facet_type, {d} AS facet_value FROM samp_geo WHERE {d} IS NOT NULL"
+        f"SELECT '{d}' AS facet_type, {d} AS facet_value FROM samp_geo WHERE NULLIF(TRIM({d}), '') IS NOT NULL"
         for d in FACET_DIMS)
     con.execute(f"""COPY (
         SELECT facet_type, facet_value, NULL::INTEGER AS scheme, COUNT(*) AS count
@@ -206,15 +271,15 @@ def build_facet_cross_filter(con, out):
             f"SELECT NULL::VARCHAR AS filter_source, NULL::VARCHAR AS filter_material, "
             f"NULL::VARCHAR AS filter_context, NULL::VARCHAR AS filter_object_type, "
             f"'{fd}' AS facet_type, {fd} AS facet_value, COUNT(*) AS count "
-            f"FROM samp_geo WHERE {fd} IS NOT NULL GROUP BY {fd}")
+            f"FROM samp_geo WHERE NULLIF(TRIM({fd}), '') IS NOT NULL GROUP BY {fd}")
     for filt in FACET_DIMS:
         for fd in FACET_DIMS:
             cols = ", ".join(
                 (f"{filt} AS filter_{c}" if c == filt else f"NULL::VARCHAR AS filter_{c}")
                 for c in FACET_DIMS)
             selects.append(
                 f"SELECT {cols}, '{fd}' AS facet_type, {fd} AS facet_value, COUNT(*) AS count "
-                f"FROM samp_geo WHERE {filt} IS NOT NULL AND {fd} IS NOT NULL GROUP BY {filt}, {fd}")
+                f"FROM samp_geo WHERE NULLIF(TRIM({filt}), '') IS NOT NULL AND NULLIF(TRIM({fd}), '') IS NOT NULL GROUP BY {filt}, {fd}")
     con.execute(f"""COPY (
         SELECT filter_source, filter_material, filter_context, filter_object_type,
                facet_type, facet_value, count

diff --git a/scripts/build_vocab_labels.py b/scripts/build_vocab_labels.py
@@ -66,6 +66,27 @@
 
 PREFERRED_LANG = "en"
 
+# Deprecated / legacy concept URIs that are absent from the live SKOS TTLs but
+# still appear in older source data (e.g. SESAR records using the specimentype/1.0
+# namespace, superseded by materialsampleobjecttype/1.0). These rows are injected
+# directly so the Explorer can display human-readable labels instead of raw URI
+# path tails. Each entry: (uri, pref_label, lang, scheme).
+# Issue #283b: 169 SESAR records carry these deprecated URIs.
+MANUAL_LABEL_OVERRIDES: list[tuple[str, str, str, str | None]] = [
+    (
+        "https://w3id.org/isample/vocabulary/specimentype/1.0/othersolidobject",
+        "Other solid object",
+        "en",
+        "https://w3id.org/isample/vocabulary/specimentype/1.0/",
+    ),
+    (
+        "https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen",
+        "Material sample",
+        "en",
+        "https://w3id.org/isample/vocabulary/specimentype/1.0/",
+    ),
+]
+
 # When a concept URI is declared in more than one TTL, prefer the row whose
 # source TTL's URL contains one of these path fragments. The fragments are
 # matched against the concept URI: a URI containing "vocabulary/material/"
@@ -286,6 +307,23 @@ def main(argv: list[str] | None = None) -> int:
         print("ERROR: no rows extracted; aborting.", file=sys.stderr)
         return 2
 
+    # Inject manual overrides for deprecated URIs not present in any live TTL.
+    # These are appended before dedupe so _dedupe can merge them if they ever
+    # appear in a future TTL revision, and so _emit_data_form_aliases does NOT
+    # re-emit them (they already carry the /1.0/ version segment).
+    for uri, label, lang, scheme in MANUAL_LABEL_OVERRIDES:
+        all_rows.append({
+            "uri": uri,
+            "uri_form": "data_v1",   # already in the /1.0/ data form
+            "pref_label": label,
+            "lang": lang,
+            "scheme": scheme,
+            "definition": None,
+            "alt_labels": [],
+            "source_ttl": "manual_override",
+        })
+    print(f"  {len(MANUAL_LABEL_OVERRIDES):>4} rows  (manual overrides for deprecated URIs)")
+
     raw_count = len(all_rows)
     all_rows = _dedupe(all_rows)
     deduped_collapsed = raw_count - len(all_rows)