Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ VALIDATE := scripts/validate_frontend_derived.py
ENRICH := scripts/enrich_wide_with_oc_concepts.py
VALIDATE_ENRICH := scripts/validate_oc_concept_enrichment.py

.PHONY: help test wide oc-wide enrich validate-enrich derived validate all all-272 clean
.PHONY: help test wide oc-wide enrich validate-enrich derived validate all all-272 ingest-272 all-202608 clean
help:
@grep -E '^# make' Makefile | sed 's/^# / /'

Expand Down Expand Up @@ -81,5 +81,30 @@ all-272: validate-enrich
$(MAKE) derived DERIVED_WIDE=$(ENRICHED) TAG=$(TAG)
$(MAKE) validate TAG=$(TAG) SENTINEL_FLAG=

# TRUE SYNC ingestion: add 67,187 new OC pids + remove 21,227 stale OC pids (#272 Phase 2, D3).
# Requires the 202606 wide (--src) and Eric's OC wide (--oc-wide).
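# `ingest-272` writes only the 202608-tagged wide parquet ($(INGEST_OUT), in $(OUTDIR) by default);
# `all-202608` then builds and validates the derived files from that wide.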
#
# make ingest-272 \
# SRC_202606=~/Data/iSample/pqg_refining/isamples_202606_wide.parquet \
# OC_WIDE_2026=~/Data/iSample/pqg_refining/oc_isamples_pqg_wide_2026-06-09.parquet
#
# 202608 ingest knobs. The `?=` ones can be overridden from the command line
# or the environment; INGEST is fixed.
#   INGEST_TAG    file-name prefix shared by the ingested wide and its derived files
#   SRC_202606    existing 202606 wide parquet (passed to the script as --src)
#   OC_WIDE_2026  OC wide parquet (passed to the script as --oc-wide)
#   INGEST_OUT    wide parquet written by the ingest script (--out)
INGEST_TAG   ?= isamples_202608
SRC_202606   ?= $(OUTDIR)/isamples_202606_wide.parquet
OC_WIDE_2026 ?= $(OUTDIR)/oc_isamples_pqg_wide_2026-06-09.parquet
INGEST_OUT   ?= $(OUTDIR)/$(INGEST_TAG)_wide.parquet
INGEST       := scripts/ingest_oc_records.py

# Re-run the ingest only when one of the two input parquets is newer than the
# output. The directory is derived from the target itself ($(@D)) so that an
# overridden INGEST_OUT outside $(OUTDIR) still gets its parent created.
$(INGEST_OUT): $(SRC_202606) $(OC_WIDE_2026)
	@mkdir -p $(@D)
	$(PY) $(INGEST) --src $(SRC_202606) --oc-wide $(OC_WIDE_2026) --out $@

# Phony alias: build just the ingested wide parquet.
ingest-272: $(INGEST_OUT)

# Full 202608 pipeline: ingest -> derived -> validate
# The sub-makes run one after the other (separate recipe lines), and
# SENTINEL_FLAG is passed empty to `validate`, exactly as all-272 does.
all-202608: ingest-272
	$(MAKE) derived DERIVED_WIDE=$(INGEST_OUT) TAG=$(INGEST_TAG)
	$(MAKE) validate TAG=$(INGEST_TAG) SENTINEL_FLAG=

# Remove every build product by deleting the whole output directory.
# OUTDIR is user-tunable, so refuse to run when it expands to nothing rather
# than silently doing no cleanup (or worse, after a later edit appends a path).
clean:
	$(if $(strip $(OUTDIR)),,$(error OUTDIR is empty; refusing to run rm -rf))
	rm -rf $(OUTDIR)
24 changes: 12 additions & 12 deletions explorer.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ format:
include-in-header:
text: |
<link rel="preconnect" href="https://data.isamples.org" crossorigin>
<link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202606_h3_summary_res4.parquet">
<link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202606_facet_summaries.parquet">
<link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/vocab_labels.parquet">
<link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202608_h3_summary_res4.parquet">
<link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/isamples_202608_facet_summaries.parquet">
<link rel="preload" as="fetch" crossorigin="anonymous" href="https://data.isamples.org/vocab_labels_202608.parquet">
---

```{=html}
Expand Down Expand Up @@ -749,23 +749,23 @@ R2_BASE = (() => {
// default and absolute overrides (http://localhost:8099/data) pass through.
return raw.startsWith('/') ? new URL(raw, location.origin).href : raw;
})()
h3_res4_url = `${R2_BASE}/isamples_202606_h3_summary_res4.parquet`
h3_res6_url = `${R2_BASE}/isamples_202606_h3_summary_res6.parquet`
h3_res8_url = `${R2_BASE}/isamples_202606_h3_summary_res8.parquet`
lite_url = `${R2_BASE}/isamples_202606_samples_map_lite.parquet`
h3_res4_url = `${R2_BASE}/isamples_202608_h3_summary_res4.parquet`
h3_res6_url = `${R2_BASE}/isamples_202608_h3_summary_res6.parquet`
h3_res8_url = `${R2_BASE}/isamples_202608_h3_summary_res8.parquet`
lite_url = `${R2_BASE}/isamples_202608_samples_map_lite.parquet`
// Explicit versioned wide (#272: OC concept-enriched — popups read material/
// object-type from this file). The stable alias `current/wide.parquet` still
// points at the previous wide until the production cutover flips the manifest;
// pinning the version here keeps staging and prod each self-consistent.
wide_url = `${R2_BASE}/isamples_202606_wide.parquet`
wide_url = `${R2_BASE}/isamples_202608_wide.parquet`
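// v2 carries object_type alongside material and context (URI-string columns).
// NOTE(review): the 202608 URL below requests sample_facets_v3, but
// scripts/build_frontend_derived.py still writes {tag}_sample_facets_v2.parquet —
// confirm which file name is actually published before cutover.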
facets_url = `${R2_BASE}/isamples_202606_sample_facets_v2.parquet`
facet_summaries_url = `${R2_BASE}/isamples_202606_facet_summaries.parquet`
facets_url = `${R2_BASE}/isamples_202608_sample_facets_v3.parquet`
facet_summaries_url = `${R2_BASE}/isamples_202608_facet_summaries.parquet`
// Pre-aggregated single-filter cache for fast cross-filtered facet counts.
cross_filter_url = `${R2_BASE}/isamples_202606_facet_cross_filter.parquet`
cross_filter_url = `${R2_BASE}/isamples_202608_facet_cross_filter.parquet`
// SKOS prefLabels for Material / Sampled Feature / Specimen Type URIs.
// ~60 KB lookup; falls back to URI tail if a URI isn't covered.
vocab_labels_url = `${R2_BASE}/vocab_labels.parquet`
vocab_labels_url = `${R2_BASE}/vocab_labels_202608.parquet`

// Canonical palette — see issue #113. Path-relative so this works under
// both isamples.org (custom domain at root) and project-pages fork
Expand Down
79 changes: 72 additions & 7 deletions scripts/build_frontend_derived.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
live in `p__has_{material,context,sample_object}_category` row-id arrays.

OUTPUTS (into --outdir, prefixed --tag):
- {tag}_sample_facets_v2.parquet pid, source, material, context, object_type, label, description, place_name(VARCHAR)
- {tag}_sample_facets_v2.parquet pid, source, material, context, object_type, label, description (search-only; includes appended concept labels), place_name(VARCHAR)
- {tag}_samples_map_lite.parquet pid, label, source, latitude, longitude, place_name(VARCHAR[]), result_time, h3_res8(UBIGINT), h3_res8_hex
- {tag}_h3_summary_res{4,6,8}.parquet h3_cell(UBIGINT), sample_count(INT), center_lat, center_lng, dominant_source, source_count(INT), resolution(INT)
- {tag}_facet_summaries.parquet facet_type, facet_value, scheme, count
Expand Down Expand Up @@ -48,6 +48,21 @@
ARTIFACTS = ["sample_facets_v2", "samples_map_lite", "h3_summaries",
"facet_summaries", "facet_cross_filter", "wide_h3"]

# SQL fragment that produces sample_facets_v2.description (#277 part 2).
#
# For a sample whose concept_labels string is non-NULL and non-blank, the
# labels are appended to the raw description with one space in between; a NULL
# description is treated as '' in that case, so the result then starts with a
# space. For every other sample the description passes through unchanged.
#
# The appended labels exist purely so full-text search can hit concept terms
# that never occur in label/description/place_name. description in facets_v2
# is SEARCH-ONLY — display text is read from the wide parquet.
#
# build_sample_facets_v2 and the validator's --wide semantic gate both use
# this one constant, which keeps the two from drifting apart.
FACETS_DESCRIPTION_EXPR = " ".join((
    "CASE",
    "WHEN concept_labels IS NOT NULL AND TRIM(concept_labels) != ''",
    "THEN COALESCE(description, '') || ' ' || concept_labels",
    "ELSE description",
    "END",
))


def log(msg, t0):
    """Print ``msg`` prefixed with the seconds elapsed since ``t0``.

    Args:
        msg: Text to show after the elapsed-time prefix.
        t0: Start timestamp on the ``time.time()`` clock.

    The prefix has the form ``[  12.3s]`` (width 6, one decimal). Output is
    flushed immediately so progress lines appear in order even when stdout is
    buffered.
    """
    elapsed = time.time() - t0
    print(f"[{elapsed:6.1f}s] {msg}", flush=True)
Expand Down Expand Up @@ -77,8 +92,12 @@ def geometry_expr(con, wide):
def build_base_tables(con, wide, t0):
geom = geometry_expr(con, wide)
con.execute(f"""
-- ic: concept lookup for facet resolution and label aggregation.
-- label is included so concept_labels can aggregate human-readable text
-- directly from the wide without a second scan.
CREATE OR REPLACE TEMP TABLE ic AS
SELECT row_id, pid AS uri FROM read_parquet('{wide}') WHERE otype='IdentifiedConcept';
SELECT row_id, pid AS uri, label
FROM read_parquet('{wide}') WHERE otype='IdentifiedConcept';

-- material: FIRST NON-ROOT concept per sample. Decorrelated (unnest+join+
-- arg_min by array ordinality) — NOT a correlated subquery and NOT a MAP
Expand All @@ -95,6 +114,39 @@ def build_base_tables(con, wide, t0):
WHERE ic.uri <> '{MATERIAL_ROOT}'
GROUP BY ex.pid;

-- concept_labels: one row per MSR pid; concept_labels is a space-joined
-- string of all DISTINCT non-null IC labels referenced across
-- p__has_material_category, p__has_sample_object_type,
-- p__has_context_category, and p__keywords. Appended (search-only) into
-- sample_facets_v2.description so full-text searches like "pottery cyprus"
-- match samples tagged with a pottery concept even if the word doesn't
-- appear in their label/description/place_name. facets_v2.description is
-- SEARCH-ONLY; display always reads description from the wide parquet.
CREATE OR REPLACE TEMP TABLE concept_labels AS
WITH all_refs AS (
SELECT s.pid, u.rid
FROM read_parquet('{wide}') s, UNNEST(s.p__has_material_category) AS u(rid)
WHERE s.otype='MaterialSampleRecord'
UNION ALL
SELECT s.pid, u.rid
FROM read_parquet('{wide}') s, UNNEST(s.p__has_sample_object_type) AS u(rid)
WHERE s.otype='MaterialSampleRecord'
UNION ALL
SELECT s.pid, u.rid
FROM read_parquet('{wide}') s, UNNEST(s.p__has_context_category) AS u(rid)
WHERE s.otype='MaterialSampleRecord'
UNION ALL
SELECT s.pid, u.rid
FROM read_parquet('{wide}') s, UNNEST(s.p__keywords) AS u(rid)
WHERE s.otype='MaterialSampleRecord'
)
SELECT r.pid,
string_agg(DISTINCT ic.label, ' ' ORDER BY ic.label) AS concept_labels
FROM all_refs r
JOIN ic ON ic.row_id = r.rid
WHERE ic.label IS NOT NULL AND TRIM(ic.label) != ''
GROUP BY r.pid;

-- one row per MaterialSampleRecord; all concept resolution via JOINs (decorrelated).
CREATE OR REPLACE TEMP TABLE samp AS
SELECT
Expand All @@ -108,11 +160,13 @@ def build_base_tables(con, wide, t0):
ROUND(ST_X({geom}), 6) AS longitude,
mat.material AS material,
ctx.uri AS context,
obj.uri AS object_type
obj.uri AS object_type,
cl.concept_labels AS concept_labels
FROM read_parquet('{wide}') s
LEFT JOIN mat ON mat.pid = s.pid
LEFT JOIN ic AS ctx ON ctx.row_id = s.p__has_context_category[1]
LEFT JOIN ic AS obj ON obj.row_id = s.p__has_sample_object_type[1]
LEFT JOIN concept_labels cl ON cl.pid = s.pid
WHERE s.otype='MaterialSampleRecord';

CREATE OR REPLACE TEMP TABLE samp_geo AS
Expand All @@ -136,8 +190,19 @@ def build_base_tables(con, wide, t0):


def build_sample_facets_v2(con, out):
# description is SEARCH-ONLY in sample_facets_v2: the explorer reads
# description for display from the wide parquet (self-join on pid), never
# from facets_v2. We append the space-joined concept labels of every
# IdentifiedConcept referenced by this sample (p__has_material_category,
# p__has_sample_object_type, p__has_context_category, p__keywords) so that
# full-text searches like "pottery cyprus" match samples tagged with a pottery
# concept even when the word doesn't appear in label/description/place_name.
# The wide's IdentifiedConcept.label is used directly (covers minted keyword
# concepts such as British Museum thesaurus terms that are absent from
# vocab_labels.parquet). See issue #277 part 2.
con.execute(f"""COPY (
SELECT pid, source, material, context, object_type, label, description,
SELECT pid, source, material, context, object_type, label,
{FACETS_DESCRIPTION_EXPR} AS description,
place_name::VARCHAR AS place_name
FROM samp_geo ORDER BY pid
) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")
Expand Down Expand Up @@ -185,7 +250,7 @@ def build_h3_summary(con, out, res):

def build_facet_summaries(con, out):
union = " UNION ALL ".join(
f"SELECT '{d}' AS facet_type, {d} AS facet_value FROM samp_geo WHERE {d} IS NOT NULL"
f"SELECT '{d}' AS facet_type, {d} AS facet_value FROM samp_geo WHERE NULLIF(TRIM({d}), '') IS NOT NULL"
for d in FACET_DIMS)
con.execute(f"""COPY (
SELECT facet_type, facet_value, NULL::INTEGER AS scheme, COUNT(*) AS count
Expand All @@ -206,15 +271,15 @@ def build_facet_cross_filter(con, out):
f"SELECT NULL::VARCHAR AS filter_source, NULL::VARCHAR AS filter_material, "
f"NULL::VARCHAR AS filter_context, NULL::VARCHAR AS filter_object_type, "
f"'{fd}' AS facet_type, {fd} AS facet_value, COUNT(*) AS count "
f"FROM samp_geo WHERE {fd} IS NOT NULL GROUP BY {fd}")
f"FROM samp_geo WHERE NULLIF(TRIM({fd}), '') IS NOT NULL GROUP BY {fd}")
for filt in FACET_DIMS:
for fd in FACET_DIMS:
cols = ", ".join(
(f"{filt} AS filter_{c}" if c == filt else f"NULL::VARCHAR AS filter_{c}")
for c in FACET_DIMS)
selects.append(
f"SELECT {cols}, '{fd}' AS facet_type, {fd} AS facet_value, COUNT(*) AS count "
f"FROM samp_geo WHERE {filt} IS NOT NULL AND {fd} IS NOT NULL GROUP BY {filt}, {fd}")
f"FROM samp_geo WHERE NULLIF(TRIM({filt}), '') IS NOT NULL AND NULLIF(TRIM({fd}), '') IS NOT NULL GROUP BY {filt}, {fd}")
con.execute(f"""COPY (
SELECT filter_source, filter_material, filter_context, filter_object_type,
facet_type, facet_value, count
Expand Down
38 changes: 38 additions & 0 deletions scripts/build_vocab_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,27 @@

PREFERRED_LANG = "en"

# Deprecated / legacy concept URIs that are absent from the live SKOS TTLs but
# still appear in older source data (e.g. SESAR records using the specimentype/1.0
# namespace, superseded by materialsampleobjecttype/1.0). These rows are injected
# directly so the Explorer can display human-readable labels instead of raw URI
# path tails. Each entry: (uri, pref_label, lang, scheme).
# Issue #283b: 169 SESAR records carry these deprecated URIs.
MANUAL_LABEL_OVERRIDES: list[tuple[str, str, str, str | None]] = [
(
"https://w3id.org/isample/vocabulary/specimentype/1.0/othersolidobject",
"Other solid object",
"en",
"https://w3id.org/isample/vocabulary/specimentype/1.0/",
),
(
"https://w3id.org/isample/vocabulary/specimentype/1.0/physicalspecimen",
"Material sample",
"en",
"https://w3id.org/isample/vocabulary/specimentype/1.0/",
),
]

# When a concept URI is declared in more than one TTL, prefer the row whose
# source TTL's URL contains one of these path fragments. The fragments are
# matched against the concept URI: a URI containing "vocabulary/material/"
Expand Down Expand Up @@ -286,6 +307,23 @@ def main(argv: list[str] | None = None) -> int:
print("ERROR: no rows extracted; aborting.", file=sys.stderr)
return 2

# Inject manual overrides for deprecated URIs not present in any live TTL.
# These are appended before dedupe so _dedupe can merge them if they ever
# appear in a future TTL revision, and so _emit_data_form_aliases does NOT
# re-emit them (they already carry the /1.0/ version segment).
for uri, label, lang, scheme in MANUAL_LABEL_OVERRIDES:
all_rows.append({
"uri": uri,
"uri_form": "data_v1", # already in the /1.0/ data form
"pref_label": label,
"lang": lang,
"scheme": scheme,
"definition": None,
"alt_labels": [],
"source_ttl": "manual_override",
})
print(f" {len(MANUAL_LABEL_OVERRIDES):>4} rows (manual overrides for deprecated URIs)")

raw_count = len(all_rows)
all_rows = _dedupe(all_rows)
deduped_collapsed = raw_count - len(all_rows)
Expand Down
Loading
Loading