From a21ab6540c9ab74e4e42c9390ae2ef3029feb682 Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Thu, 25 Jun 2026 08:27:24 -0700 Subject: [PATCH] Store original uploaded filename, admin-only, for talks/posters/pubs (#1391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Artifact.save() renames uploaded files to the standardized Author_TitleInTitleCase_VenueYear scheme, destroying the human-recognizable upload name (e.g. MyTalk_v3_final.pptx). Capture that name into two new admin-only fields and surface it read-only on the change form, as a provenance breadcrumb for confirming/debugging uploads. - Artifact (abstract base): add nullable, editable=False original_pdf_filename and original_raw_filename. Capture block at the top of save() snapshots the basename ONLY on a genuine new upload — the first save, or an edit where the file field is in the incoming update_fields (before the rename logic appends to it). It deliberately does not fire on the m2m authors_changed rename re-save or on metadata-only edits, so a standardized name never clobbers a stored original. - ArtifactAdmin: read-only "Originally uploaded as" display injected into the Files fieldset on the change form (inherited by Talk/Poster/Publication). - backfill_original_filenames management command: production has many bulk-imported rows whose files were never renamed (their on-disk name still IS the original). Recover those: if a file's basename != generate_filename() (and isn't a "-" uniquified variant of it), record the current basename as the original. Idempotent (fills nulls only); wired into docker-entrypoint.sh so it runs at container start before any admin edit can rename a legacy row and lose its original name. - Tests: capture-survives-rename, edit-replace, metadata-only-no-clobber, backfill recover/skip/idempotent/uniquified, admin display. No committed migration (matches repo convention): settings_test builds from models and prod/test run makemigrations website at container start. Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-entrypoint.sh | 5 + website/admin/artifact_admin.py | 55 +++++- .../commands/backfill_original_filenames.py | 131 +++++++++++++ website/models/artifact.py | 43 ++++- website/tests/test_artifact.py | 182 +++++++++++++++++- 5 files changed, 403 insertions(+), 13 deletions(-) create mode 100644 website/management/commands/backfill_original_filenames.py diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index e6aa2d0e..05d7ffeb 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -138,6 +138,11 @@ echo "4.7 Running 'python manage.py backfill_project_visibility' to resolve is_v echo "******************************************" python manage.py backfill_project_visibility +echo "****************** STEP 4.7b/5: docker-entrypoint.sh ************************" +echo "4.7b Running 'python manage.py backfill_original_filenames' to recover original upload filenames for never-renamed artifacts (#1391)" +echo "******************************************" +python manage.py backfill_original_filenames + echo "****************** STEP 4.8/5: docker-entrypoint.sh ************************" echo "4.8 Running 'python manage.py recompute_url_names' to de-collide historical url_names (#1206)" echo "******************************************" diff --git a/website/admin/artifact_admin.py b/website/admin/artifact_admin.py index e8ba06ce..b9fb12f6 100644 --- a/website/admin/artifact_admin.py +++ b/website/admin/artifact_admin.py @@ -1,7 +1,7 @@ from django.contrib import admin from website.models import Artifact from django.contrib.admin import widgets -from django.utils.html import format_html +from django.utils.html import format_html, format_html_join from sortedm2m_filter_horizontal_widget.forms import SortedFilteredSelectMultiple from website.utils.upload_validators import PDF_EXTENSIONS, RAW_FILE_EXTENSIONS from easy_thumbnails.files import get_thumbnailer @@ -41,9 +41,10 @@ class Media: # (Django auto-applies DISTINCT for the M2M join). Subclasses may extend this. search_fields = ['title', 'forum_name', 'authors__first_name', 'authors__last_name'] - # thumbnail_preview is a computed, read-only display (see below). It must be - # listed here so Django allows it in get_fieldsets() on the change form. - readonly_fields = ('thumbnail_preview',) + # thumbnail_preview and original_upload_filenames are computed, read-only + # displays (see below). They must be listed here so Django allows them in + # get_fieldsets() on the change form. + readonly_fields = ('thumbnail_preview', 'original_upload_filenames') fieldsets = [ (None, {'fields': ['title', 'authors', 'date']}), @@ -97,12 +98,45 @@ def thumbnail_preview(self, obj): # Django auto-appends the trailing colon in the admin label. thumbnail_preview.short_description = 'PDF thumbnail' + def original_upload_filenames(self, obj): + """ + Read-only provenance breadcrumb showing the human-recognizable name(s) + the file(s) had when uploaded (e.g. "MyTalk_v3_final.pptx"), before + ``Artifact.save()`` renamed them to the standardized scheme. Admin-only + — sourced from the ``original_pdf_filename`` / ``original_raw_filename`` + model fields (issue #1391). + + Shows a muted placeholder when neither is recorded: historical rows + whose file was already renamed can't be recovered (the original name is + gone), and PDF-less / raw-file-less artifacts simply have nothing to + show. + """ + if obj is None: + return format_html('') + rows = [] + if obj.original_pdf_filename: + rows.append(('PDF', obj.original_pdf_filename)) + if obj.original_raw_filename: + rows.append(('Raw file', obj.original_raw_filename)) + if not rows: + return format_html( + 'Not recorded — uploaded before this ' + 'was tracked, or no file attached.' + ) + return format_html_join( + '', '
{}: {}
', rows + ) + + # Django auto-appends the trailing colon in the admin label. + original_upload_filenames.short_description = 'Originally uploaded as' + def get_fieldsets(self, request, obj=None): """ - Inject the read-only ``thumbnail_preview`` into the 'Files' fieldset on - the change form only. Done here (rather than in each child admin's - ``fieldsets``) so Publication / Talk / Poster all get the preview. - On the Add form there is no saved thumbnail yet, so it is omitted. + Inject the read-only ``thumbnail_preview`` and ``original_upload_filenames`` + displays into the 'Files' fieldset on the change form only. Done here + (rather than in each child admin's ``fieldsets``) so Publication / Talk / + Poster all get them. On the Add form there is no saved thumbnail or + captured upload name yet, so both are omitted. """ fieldsets = super().get_fieldsets(request, obj) if obj is None: @@ -113,8 +147,9 @@ def get_fieldsets(self, request, obj=None): for name, opts in fieldsets: if name == 'Files': fields = list(opts.get('fields', [])) - if 'thumbnail_preview' not in fields: - fields = fields + ['thumbnail_preview'] + for extra in ('thumbnail_preview', 'original_upload_filenames'): + if extra not in fields: + fields = fields + [extra] opts = {**opts, 'fields': fields} updated.append((name, opts)) return updated diff --git a/website/management/commands/backfill_original_filenames.py b/website/management/commands/backfill_original_filenames.py new file mode 100644 index 00000000..1ef875e9 --- /dev/null +++ b/website/management/commands/backfill_original_filenames.py @@ -0,0 +1,131 @@ +import os +import logging + +from django.core.management.base import BaseCommand + +from website.models import Artifact, Talk, Poster, Publication + +# This retrieves a Python logging instance (or creates it) +_logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = ( + "Backfills Artifact.original_pdf_filename / original_raw_filename for " + "existing talks, posters, and publications whose file was never renamed " + "to the standardized scheme (issue #1391). Production has many such rows " + "predating the auto-rename feature, so their on-disk filename still IS " + "the original upload name and can be recovered. Rows whose file already " + "matches the standardized scheme have lost their original name and are " + "left blank. Only fills empty values, so it never overwrites a name " + "already captured at upload time. Idempotent: safe to run on every " + "container start." + ) + + # The concrete artifact models whose files this backfill covers. + MODELS = (Talk, Poster, Publication) + + # (FileField attr, original-name field attr) pairs to backfill. + FILE_FIELDS = ( + ('pdf_file', 'original_pdf_filename'), + ('raw_file', 'original_raw_filename'), + ) + + def add_arguments(self, parser): + parser.add_argument( + "--dry-run", + action="store_true", + help="Report what would change without writing to the database.", + ) + + def handle(self, *args, **options): + dry_run = options["dry_run"] + _logger.debug( + f"Running backfill_original_filenames.py (dry_run={dry_run}) to " + f"recover original upload filenames for never-renamed artifacts." + ) + + total_updated = 0 + total_skipped = 0 + for model in self.MODELS: + updated, skipped = self._backfill_model(model, dry_run) + total_updated += updated + total_skipped += skipped + + verb = "Would update" if dry_run else "Updated" + _logger.info( + f"backfill_original_filenames: {verb} {total_updated} filename " + f"field(s); skipped {total_skipped} (already standardized — original " + f"name unrecoverable)." + ) + _logger.debug("Completed backfill_original_filenames.py") + + def _backfill_model(self, model, dry_run): + """Backfill both file fields for one concrete artifact model. + + Returns a ``(num_updated, num_skipped)`` tuple counting individual + filename fields touched / deliberately left blank. + """ + num_updated = 0 + num_skipped = 0 + for file_attr, original_attr in self.FILE_FIELDS: + # Only rows that have a file but no captured original name yet. + candidates = ( + model.objects.filter(**{f"{original_attr}__isnull": True}) + .exclude(**{file_attr: ""}) + .exclude(**{f"{file_attr}__isnull": True}) + ) + # generate_filename() reads the first author's last name, so prefetch + # authors to avoid a per-row query. + candidates = candidates.prefetch_related("authors") + + for artifact in candidates: + file_field = getattr(artifact, file_attr) + if not file_field: + continue + + current_basename = os.path.basename(file_field.name) + current_no_ext = os.path.splitext(current_basename)[0] + standardized_no_ext = Artifact.generate_filename(artifact) + + # Treat the file as already-standardized when its name equals the + # standardized scheme OR is a uniquified variant of it. When a + # standardized name collides on disk, ensure_filename_is_unique() + # (fileutils.py) appends "-" — e.g. + # "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name + # still STARTS WITH the standardized base. Matching only on exact + # equality would misread those as never-renamed and record the + # standardized+suffix name as the "original" — a false positive. + already_standardized = ( + current_no_ext == standardized_no_ext + or current_no_ext.startswith(standardized_no_ext + "-") + ) + if already_standardized: + # Already renamed — the original upload name is gone. + _logger.debug( + f"Skipping {model.__name__} id={artifact.pk} {file_attr}=" + f"'{current_basename}': already standardized." + ) + num_skipped += 1 + continue + + # Never renamed: the current on-disk name is the original. + if dry_run: + _logger.debug( + f"[dry-run] Would set {original_attr}='{current_basename}' " + f"for {model.__name__} id={artifact.pk} '{artifact.title}'" + ) + else: + # Write directly via the queryset so this stays a pure data + # backfill — no file-rename / thumbnail side effects from the + # model's save(). + model.objects.filter(pk=artifact.pk).update( + **{original_attr: current_basename} + ) + _logger.debug( + f"Set {original_attr}='{current_basename}' for " + f"{model.__name__} id={artifact.pk} '{artifact.title}'" + ) + num_updated += 1 + + return num_updated, num_skipped diff --git a/website/models/artifact.py b/website/models/artifact.py index 3eacd138..22913720 100644 --- a/website/models/artifact.py +++ b/website/models/artifact.py @@ -41,7 +41,18 @@ class Artifact(models.Model): raw_file.help_text = "The raw file (e.g., pptx, keynote) for the artifact. While not required, this is "\ "highly recommended as it creates a better archive of the work" thumbnail = models.ImageField(upload_to=get_upload_thumbnail_dir, editable=False, null=True, max_length=255) - + + # Provenance: the human-recognizable name of the file as it was uploaded + # (e.g., "MyTalk_v3_final.pptx"), before save() renames it to the + # standardized Author_Title_VenueYear scheme. Admin-only (editable=False so + # it never appears on the public-facing form; surfaced read-only on the + # admin change form). Captured only on a genuine new upload — see save(). + # Existing rows whose file was already renamed can't be recovered and stay + # null (the backfill_original_filenames command fills the never-renamed + # ones whose on-disk name is still the original). See issue #1391. + original_pdf_filename = models.CharField(max_length=255, blank=True, null=True, editable=False) + original_raw_filename = models.CharField(max_length=255, blank=True, null=True, editable=False) + # Project and keyword associations projects = models.ManyToManyField('Project', blank=True) projects.help_text = "Most artifacts are associated with only one project but "\ @@ -233,7 +244,35 @@ def save(self, *args, **kwargs): first_time_saved = self.id is None _logger.debug(f"For artifact.id={self.id}, first_time_saved={first_time_saved}") - + + # --- #1391: snapshot the original uploaded filename(s) --- + # The rename logic further down destroys the human-recognizable upload + # name (e.g. "MyTalk_v3_final.pptx"). We capture it here, but ONLY on a + # genuine new upload — never on a later edit or the m2m-triggered rename + # pass, where the file already carries the standardized name. A new + # upload is detectable two ways: + # 1. the first save of this artifact (the file is the just-uploaded + # one), or + # 2. an edit where the form reported the file field as changed, i.e. + # it is in the incoming update_fields BEFORE the rename block below + # appends to that list. + # On an edit we must also add the original_* field to update_fields so + # it persists (the first save writes all fields anyway). + incoming_update_fields = kwargs.get('update_fields') + for file_attr, original_attr in (('pdf_file', 'original_pdf_filename'), + ('raw_file', 'original_raw_filename')): + file_field = getattr(self, file_attr) + if not file_field: + continue + is_new_upload = first_time_saved or ( + incoming_update_fields is not None and file_attr in incoming_update_fields + ) + if is_new_upload: + setattr(self, original_attr, os.path.basename(file_field.name)) + _logger.debug(f"Captured {original_attr}={getattr(self, original_attr)} for artifact.id={self.id}") + if not first_time_saved: + kwargs.setdefault('update_fields', []).append(original_attr) + # Note that "update_fields" is custom filled by our save_model in ArtifactAdmin # It will never contain the m2m fields (e.g., authors, keywords, etc.) due to # how Django handles m2m fields. Instead, you can hook up an m2m_changed signal diff --git a/website/tests/test_artifact.py b/website/tests/test_artifact.py index 7b2b2ef2..dac88ea5 100644 --- a/website/tests/test_artifact.py +++ b/website/tests/test_artifact.py @@ -1,12 +1,24 @@ """Tests for Artifact model methods (filename-drift check, raw-file label).""" +import os from datetime import date from unittest.mock import MagicMock, patch +from django.core.files.uploadedfile import SimpleUploadedFile +from django.core.management import call_command from django.test import SimpleTestCase -from website.models import Publication +from website.models import Artifact, Publication, Talk from website.tests.base import DatabaseTestCase +from website.tests.factories import TalkFactory + + +def _pdf(name): + return SimpleUploadedFile(name, b"%PDF-1.4 test", content_type="application/pdf") + + +def _raw(name): + return SimpleUploadedFile(name, b"PKstub", content_type="application/octet-stream") # --- Artifact filename check regression ----------------------------------- @@ -143,3 +155,171 @@ def test_resaving_artifact_without_pdf_does_not_crash(self): pub.location = "Seattle, WA" pub.save() # must not raise self.assertFalse(bool(Publication.objects.get(pk=pub.pk).pdf_file)) + + +# --- #1391: original uploaded filename capture ---------------------------- + + +class OriginalFilenameCaptureTests(DatabaseTestCase): + """ + Artifact.save() captures the human-recognizable upload name into + ``original_pdf_filename`` / ``original_raw_filename`` before the + standardized rename destroys it, but only on a *genuine new upload* + (issue #1391). + """ + + def test_new_upload_captures_original_and_survives_rename(self): + """ + On the initial upload the original names are snapshotted, and they + survive the ``authors_changed`` rename pass that renames the files on + disk to the standardized Author_Title_VenueYear scheme. + """ + person = self.make_person(last_name="Zhang") + talk = TalkFactory( + title="My Cool Talk", + forum_name="CHI", + date=date(2024, 1, 1), + pdf_file=_pdf("MyTalk_v3_final.pdf"), + raw_file=_raw("MyTalk_v3_final.pptx"), + authors=[person], + ) + talk.refresh_from_db() + + # The original upload names are preserved... + self.assertEqual(talk.original_pdf_filename, "MyTalk_v3_final.pdf") + self.assertEqual(talk.original_raw_filename, "MyTalk_v3_final.pptx") + # ...while the files on disk were renamed to the standardized scheme. + self.assertIn("Zhang", os.path.basename(talk.pdf_file.name)) + self.assertNotIn("MyTalk_v3_final", os.path.basename(talk.pdf_file.name)) + + def test_replacing_file_on_edit_updates_original(self): + """ + Uploading a replacement file through an edit (the admin passes the + changed field in ``update_fields``) re-captures the new upload name. + """ + person = self.make_person(last_name="Zhang") + talk = TalkFactory( + title="My Cool Talk", forum_name="CHI", date=date(2024, 1, 1), + pdf_file=_pdf("MyTalk_v3_final.pdf"), authors=[person], + ) + + talk.pdf_file = _pdf("Revised_v2_FINAL.pdf") + talk.save(update_fields=["pdf_file"]) + talk.refresh_from_db() + + self.assertEqual(talk.original_pdf_filename, "Revised_v2_FINAL.pdf") + + def test_metadata_only_edit_does_not_clobber_original(self): + """ + A later edit that does not touch the file (only metadata) must NOT + overwrite the stored original with the now-standardized filename. + """ + person = self.make_person(last_name="Zhang") + talk = TalkFactory( + title="My Cool Talk", forum_name="CHI", date=date(2024, 1, 1), + pdf_file=_pdf("MyTalk_v3_final.pdf"), authors=[person], + ) + + talk.location = "Honolulu, HI" + talk.save(update_fields=["location"]) + talk.refresh_from_db() + + self.assertEqual(talk.original_pdf_filename, "MyTalk_v3_final.pdf") + + +class BackfillOriginalFilenamesTests(DatabaseTestCase): + """ + The backfill_original_filenames command recovers the original upload name + for never-renamed artifacts (whose on-disk filename still IS the original), + leaves already-standardized rows blank, and never overwrites a value that + is already set (issue #1391). + """ + + def test_backfills_never_renamed_leaves_standardized_and_is_idempotent(self): + # (a) Legacy never-renamed talk: created without authors, so save() + # leaves the upload filename untouched. Then null the captured original + # to simulate a row predating this feature. + legacy = TalkFactory( + title="Legacy Talk", forum_name="UIST", date=date(2020, 1, 1), + pdf_file=_pdf("OldUpload_final.pdf"), + ) + legacy_name = os.path.basename(legacy.pdf_file.name) + Talk.objects.filter(pk=legacy.pk).update(original_pdf_filename=None) + + # (b) Standardized talk: created with an author, so save() renamed the + # file to the Author_Title_VenueYear scheme. Null its original too. + standard = TalkFactory( + title="Standard Talk", forum_name="CHI", date=date(2021, 1, 1), + pdf_file=_pdf("whatever_upload.pdf"), + authors=[self.make_person(last_name="Lee")], + ) + Talk.objects.filter(pk=standard.pk).update(original_pdf_filename=None) + + call_command("backfill_original_filenames") + + legacy.refresh_from_db() + standard.refresh_from_db() + # Never-renamed file: its current basename is recorded as the original. + self.assertEqual(legacy.original_pdf_filename, legacy_name) + # Already-standardized file: original is unrecoverable, left blank. + self.assertIsNone(standard.original_pdf_filename) + + # Idempotent: a value already set is never overwritten (only nulls fill). + Talk.objects.filter(pk=legacy.pk).update( + original_pdf_filename="ManuallyCorrected.pdf" + ) + call_command("backfill_original_filenames") + legacy.refresh_from_db() + self.assertEqual(legacy.original_pdf_filename, "ManuallyCorrected.pdf") + + def test_uniquified_standardized_name_is_not_backfilled(self): + """ + When a standardized filename collided on disk, the rename appended a + "-" suffix (ensure_filename_is_unique). Such a name is still + a renamed file, NOT an original upload, so the backfill must leave it + blank rather than recording the standardized+suffix name. + """ + person = self.make_person(last_name="Park") + talk = TalkFactory( + title="Unique Talk", forum_name="CHI", date=date(2022, 1, 1), + pdf_file=_pdf("anything.pdf"), authors=[person], + ) + standardized = Artifact.generate_filename(talk) + # Simulate the uniquified on-disk name and a not-yet-backfilled row. + uniquified = f"talks/{standardized}-1782399772.42.pdf" + Talk.objects.filter(pk=talk.pk).update( + pdf_file=uniquified, original_pdf_filename=None + ) + + call_command("backfill_original_filenames") + + talk.refresh_from_db() + self.assertIsNone(talk.original_pdf_filename) + + +class OriginalUploadFilenamesDisplayTests(SimpleTestCase): + """ArtifactAdmin.original_upload_filenames read-only display (issue #1391).""" + + def _render(self, pdf=None, raw=None): + from website.admin.artifact_admin import ArtifactAdmin + obj = MagicMock() + obj.original_pdf_filename = pdf + obj.original_raw_filename = raw + # The method does not use self; pass None. + return str(ArtifactAdmin.original_upload_filenames(None, obj)) + + def test_shows_both_filenames(self): + html = self._render(pdf="MyTalk_v3.pdf", raw="MyTalk_v3.pptx") + self.assertIn("PDF", html) + self.assertIn("MyTalk_v3.pdf", html) + self.assertIn("Raw file", html) + self.assertIn("MyTalk_v3.pptx", html) + + def test_pdf_only_omits_raw_row(self): + html = self._render(pdf="MyTalk_v3.pdf", raw=None) + self.assertIn("MyTalk_v3.pdf", html) + self.assertNotIn("Raw file", html) + + def test_placeholder_when_nothing_recorded(self): + html = self._render(pdf=None, raw=None) + self.assertIn("Not recorded", html)