Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 84 additions & 51 deletions website/management/commands/backfill_original_filenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,27 +47,39 @@ def handle(self, *args, **options):

total_updated = 0
total_skipped = 0
total_errors = 0
for model in self.MODELS:
updated, skipped = self._backfill_model(model, dry_run)
updated, skipped, errors = self._backfill_model(model, dry_run)
total_updated += updated
total_skipped += skipped
total_errors += errors

verb = "Would update" if dry_run else "Updated"
_logger.info(
f"backfill_original_filenames: {verb} {total_updated} filename "
f"field(s); skipped {total_skipped} (already standardized — original "
f"name unrecoverable)."
f"name unrecoverable); {total_errors} row(s) errored and were skipped."
)
_logger.debug("Completed backfill_original_filenames.py")

def _backfill_model(self, model, dry_run):
"""Backfill both file fields for one concrete artifact model.

Returns a ``(num_updated, num_skipped)`` tuple counting individual
filename fields touched / deliberately left blank.
Returns a ``(num_updated, num_skipped, num_errors)`` tuple counting
individual filename fields touched / deliberately left blank / skipped
because the row raised.

Each row is processed inside its own try/except: this iterates the
whole production dataset, and a single malformed row (e.g. a null
``date`` or ``title`` makes ``generate_filename`` raise) must not abort
the entire backfill and leave every later row untouched. The entrypoint
has no ``set -e``, so an aborted run would fail silently (the traceback
prints but startup continues) — this per-row isolation is defensive
insurance against that.
"""
num_updated = 0
num_skipped = 0
num_errors = 0
for file_attr, original_attr in self.FILE_FIELDS:
# Only rows that have a file but no captured original name yet.
candidates = (
Expand All @@ -80,52 +92,73 @@ def _backfill_model(self, model, dry_run):
candidates = candidates.prefetch_related("authors")

for artifact in candidates:
file_field = getattr(artifact, file_attr)
if not file_field:
continue

current_basename = os.path.basename(file_field.name)
current_no_ext = os.path.splitext(current_basename)[0]
standardized_no_ext = Artifact.generate_filename(artifact)

# Treat the file as already-standardized when its name equals the
# standardized scheme OR is a uniquified variant of it. When a
# standardized name collides on disk, ensure_filename_is_unique()
# (fileutils.py) appends "-<timestamp>" — e.g.
# "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name
# still STARTS WITH the standardized base. Matching only on exact
# equality would misread those as never-renamed and record the
# standardized+suffix name as the "original" — a false positive.
already_standardized = (
current_no_ext == standardized_no_ext
or current_no_ext.startswith(standardized_no_ext + "-")
)
if already_standardized:
# Already renamed — the original upload name is gone.
_logger.debug(
f"Skipping {model.__name__} id={artifact.pk} {file_attr}="
f"'{current_basename}': already standardized."
try:
if self._backfill_row(model, artifact, file_attr,
original_attr, dry_run):
num_updated += 1
else:
num_skipped += 1
except Exception:
# Log and move on — never let one row kill the batch.
_logger.exception(
"backfill_original_filenames: skipping %s id=%s %s due "
"to an error", model.__name__,
getattr(artifact, "pk", "?"), file_attr,
)
num_skipped += 1
continue

# Never renamed: the current on-disk name is the original.
if dry_run:
_logger.debug(
f"[dry-run] Would set {original_attr}='{current_basename}' "
f"for {model.__name__} id={artifact.pk} '{artifact.title}'"
)
else:
# Write directly via the queryset so this stays a pure data
# backfill — no file-rename / thumbnail side effects from the
# model's save().
model.objects.filter(pk=artifact.pk).update(
**{original_attr: current_basename}
)
_logger.debug(
f"Set {original_attr}='{current_basename}' for "
f"{model.__name__} id={artifact.pk} '{artifact.title}'"
)
num_updated += 1
num_errors += 1

return num_updated, num_skipped, num_errors

def _backfill_row(self, model, artifact, file_attr, original_attr, dry_run):
"""Backfill one (artifact, file field) pair.

Returns True if the original name was (or would be) recorded, False if
the row was deliberately skipped because its file is already
standardized. Raises on malformed data — the caller isolates that.
"""
file_field = getattr(artifact, file_attr)
if not file_field:
return False

current_basename = os.path.basename(file_field.name)
current_no_ext = os.path.splitext(current_basename)[0]
standardized_no_ext = Artifact.generate_filename(artifact)

# Treat the file as already-standardized when its name equals the
# standardized scheme OR is a uniquified variant of it. When a
# standardized name collides on disk, ensure_filename_is_unique()
# (fileutils.py) appends "-<timestamp>" — e.g.
# "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name still
# STARTS WITH the standardized base. Matching only on exact equality
# would misread those as never-renamed and record the standardized+
# suffix name as the "original" — a false positive.
already_standardized = (
current_no_ext == standardized_no_ext
or current_no_ext.startswith(standardized_no_ext + "-")
)
if already_standardized:
# Already renamed — the original upload name is gone.
_logger.debug(
f"Skipping {model.__name__} id={artifact.pk} {file_attr}="
f"'{current_basename}': already standardized."
)
return False

return num_updated, num_skipped
# Never renamed: the current on-disk name is the original.
if dry_run:
_logger.debug(
f"[dry-run] Would set {original_attr}='{current_basename}' "
f"for {model.__name__} id={artifact.pk} '{artifact.title}'"
)
else:
# Write directly via the queryset so this stays a pure data
# backfill — no file-rename / thumbnail side effects from the
# model's save().
model.objects.filter(pk=artifact.pk).update(
**{original_attr: current_basename}
)
_logger.debug(
f"Set {original_attr}='{current_basename}' for "
f"{model.__name__} id={artifact.pk} '{artifact.title}'"
)
return True
34 changes: 34 additions & 0 deletions website/tests/test_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,40 @@ def test_uniquified_standardized_name_is_not_backfilled(self):
talk.refresh_from_db()
self.assertIsNone(talk.original_pdf_filename)

def test_one_bad_row_does_not_abort_the_batch(self):
"""
The backfill runs on every container start over the whole dataset, so a
single malformed row must not abort the run and leave every other row
untouched. A null ``date`` makes ``generate_filename`` raise
(``date.year``); before per-row isolation that exception propagated out
of the command. Here the bad row is skipped and a good legacy row is
still backfilled.
"""
# Malformed: no authors (so save() skips the rename and accepts the
# null date) + a file, which makes it a backfill candidate that raises.
bad = TalkFactory(
title="Bad Row", forum_name="CHI", date=None,
pdf_file=_pdf("bad_upload.pdf"),
)
# A good legacy candidate with a non-standard (never-renamed) name.
good = TalkFactory(
title="Good Row", forum_name="CHI", date=date(2019, 1, 1),
pdf_file=_pdf("good_upload_final.pdf"),
)
good_name = os.path.basename(good.pdf_file.name)
Talk.objects.filter(pk__in=[bad.pk, good.pk]).update(
original_pdf_filename=None
)

# Must not raise despite the malformed row...
call_command("backfill_original_filenames")

good.refresh_from_db()
bad.refresh_from_db()
# ...and the good row is still processed.
self.assertEqual(good.original_pdf_filename, good_name)
self.assertIsNone(bad.original_pdf_filename)


class OriginalUploadFilenamesDisplayTests(SimpleTestCase):
"""ArtifactAdmin.original_upload_filenames read-only display (issue #1391)."""
Expand Down
Loading