diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index e6aa2d0e..05d7ffeb 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -138,6 +138,11 @@ echo "4.7 Running 'python manage.py backfill_project_visibility' to resolve is_v
echo "******************************************"
python manage.py backfill_project_visibility
+echo "****************** STEP 4.7b/5: docker-entrypoint.sh ************************"
+echo "4.7b Running 'python manage.py backfill_original_filenames' to recover original upload filenames for never-renamed artifacts (#1391)"
+echo "******************************************"
+python manage.py backfill_original_filenames
+
echo "****************** STEP 4.8/5: docker-entrypoint.sh ************************"
echo "4.8 Running 'python manage.py recompute_url_names' to de-collide historical url_names (#1206)"
echo "******************************************"
diff --git a/website/admin/artifact_admin.py b/website/admin/artifact_admin.py
index e8ba06ce..b9fb12f6 100644
--- a/website/admin/artifact_admin.py
+++ b/website/admin/artifact_admin.py
@@ -1,7 +1,7 @@
from django.contrib import admin
from website.models import Artifact
from django.contrib.admin import widgets
-from django.utils.html import format_html
+from django.utils.html import format_html, format_html_join
from sortedm2m_filter_horizontal_widget.forms import SortedFilteredSelectMultiple
from website.utils.upload_validators import PDF_EXTENSIONS, RAW_FILE_EXTENSIONS
from easy_thumbnails.files import get_thumbnailer
@@ -41,9 +41,10 @@ class Media:
# (Django auto-applies DISTINCT for the M2M join). Subclasses may extend this.
search_fields = ['title', 'forum_name', 'authors__first_name', 'authors__last_name']
- # thumbnail_preview is a computed, read-only display (see below). It must be
- # listed here so Django allows it in get_fieldsets() on the change form.
- readonly_fields = ('thumbnail_preview',)
+ # thumbnail_preview and original_upload_filenames are computed, read-only
+ # displays (see below). They must be listed here so Django allows them in
+ # get_fieldsets() on the change form.
+ readonly_fields = ('thumbnail_preview', 'original_upload_filenames')
fieldsets = [
(None, {'fields': ['title', 'authors', 'date']}),
@@ -97,12 +98,45 @@ def thumbnail_preview(self, obj):
# Django auto-appends the trailing colon in the admin label.
thumbnail_preview.short_description = 'PDF thumbnail'
+ def original_upload_filenames(self, obj):
+ """
+ Read-only provenance breadcrumb showing the human-recognizable name(s)
+ the file(s) had when uploaded (e.g. "MyTalk_v3_final.pptx"), before
+ ``Artifact.save()`` renamed them to the standardized scheme. Admin-only
+ — sourced from the ``original_pdf_filename`` / ``original_raw_filename``
+ model fields (issue #1391).
+
+ Shows a muted placeholder when neither is recorded: historical rows
+ whose file was already renamed can't be recovered (the original name is
+ gone), and PDF-less / raw-file-less artifacts simply have nothing to
+ show.
+ """
+ if obj is None:
+ return format_html('—')
+ rows = []
+ if obj.original_pdf_filename:
+ rows.append(('PDF', obj.original_pdf_filename))
+ if obj.original_raw_filename:
+ rows.append(('Raw file', obj.original_raw_filename))
+ if not rows:
+ return format_html(
+ 'Not recorded — uploaded before this '
+ 'was tracked, or no file attached.'
+ )
+ return format_html_join(
+ '', '
{}: {}
', rows
+ )
+
+ # Django auto-appends the trailing colon in the admin label.
+ original_upload_filenames.short_description = 'Originally uploaded as'
+
def get_fieldsets(self, request, obj=None):
"""
- Inject the read-only ``thumbnail_preview`` into the 'Files' fieldset on
- the change form only. Done here (rather than in each child admin's
- ``fieldsets``) so Publication / Talk / Poster all get the preview.
- On the Add form there is no saved thumbnail yet, so it is omitted.
+ Inject the read-only ``thumbnail_preview`` and ``original_upload_filenames``
+ displays into the 'Files' fieldset on the change form only. Done here
+ (rather than in each child admin's ``fieldsets``) so Publication / Talk /
+ Poster all get them. On the Add form there is no saved thumbnail or
+ captured upload name yet, so both are omitted.
"""
fieldsets = super().get_fieldsets(request, obj)
if obj is None:
@@ -113,8 +147,9 @@ def get_fieldsets(self, request, obj=None):
for name, opts in fieldsets:
if name == 'Files':
fields = list(opts.get('fields', []))
- if 'thumbnail_preview' not in fields:
- fields = fields + ['thumbnail_preview']
+ for extra in ('thumbnail_preview', 'original_upload_filenames'):
+ if extra not in fields:
+ fields = fields + [extra]
opts = {**opts, 'fields': fields}
updated.append((name, opts))
return updated
diff --git a/website/management/commands/backfill_original_filenames.py b/website/management/commands/backfill_original_filenames.py
new file mode 100644
index 00000000..1ef875e9
--- /dev/null
+++ b/website/management/commands/backfill_original_filenames.py
@@ -0,0 +1,131 @@
+import os
+import logging
+
+from django.core.management.base import BaseCommand
+
+from website.models import Artifact, Talk, Poster, Publication
+
+# This retrieves a Python logging instance (or creates it)
+_logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+ help = (
+ "Backfills Artifact.original_pdf_filename / original_raw_filename for "
+ "existing talks, posters, and publications whose file was never renamed "
+ "to the standardized scheme (issue #1391). Production has many such rows "
+ "predating the auto-rename feature, so their on-disk filename still IS "
+ "the original upload name and can be recovered. Rows whose file already "
+ "matches the standardized scheme have lost their original name and are "
+ "left blank. Only fills empty values, so it never overwrites a name "
+ "already captured at upload time. Idempotent: safe to run on every "
+ "container start."
+ )
+
+ # The concrete artifact models whose files this backfill covers.
+ MODELS = (Talk, Poster, Publication)
+
+ # (FileField attr, original-name field attr) pairs to backfill.
+ FILE_FIELDS = (
+ ('pdf_file', 'original_pdf_filename'),
+ ('raw_file', 'original_raw_filename'),
+ )
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ "--dry-run",
+ action="store_true",
+ help="Report what would change without writing to the database.",
+ )
+
+ def handle(self, *args, **options):
+ dry_run = options["dry_run"]
+ _logger.debug(
+ f"Running backfill_original_filenames.py (dry_run={dry_run}) to "
+ f"recover original upload filenames for never-renamed artifacts."
+ )
+
+ total_updated = 0
+ total_skipped = 0
+ for model in self.MODELS:
+ updated, skipped = self._backfill_model(model, dry_run)
+ total_updated += updated
+ total_skipped += skipped
+
+ verb = "Would update" if dry_run else "Updated"
+ _logger.info(
+ f"backfill_original_filenames: {verb} {total_updated} filename "
+ f"field(s); skipped {total_skipped} (already standardized — original "
+ f"name unrecoverable)."
+ )
+ _logger.debug("Completed backfill_original_filenames.py")
+
+ def _backfill_model(self, model, dry_run):
+ """Backfill both file fields for one concrete artifact model.
+
+ Returns a ``(num_updated, num_skipped)`` tuple counting individual
+ filename fields touched / deliberately left blank.
+ """
+ num_updated = 0
+ num_skipped = 0
+ for file_attr, original_attr in self.FILE_FIELDS:
+ # Only rows that have a file but no captured original name yet.
+ candidates = (
+ model.objects.filter(**{f"{original_attr}__isnull": True})
+ .exclude(**{file_attr: ""})
+ .exclude(**{f"{file_attr}__isnull": True})
+ )
+ # generate_filename() reads the first author's last name, so prefetch
+ # authors to avoid a per-row query.
+ candidates = candidates.prefetch_related("authors")
+
+ for artifact in candidates:
+ file_field = getattr(artifact, file_attr)
+ if not file_field:
+ continue
+
+ current_basename = os.path.basename(file_field.name)
+ current_no_ext = os.path.splitext(current_basename)[0]
+ standardized_no_ext = Artifact.generate_filename(artifact)
+
+ # Treat the file as already-standardized when its name equals the
+ # standardized scheme OR is a uniquified variant of it. When a
+ # standardized name collides on disk, ensure_filename_is_unique()
+ # (fileutils.py) appends "-" — e.g.
+ # "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name
+ # still STARTS WITH the standardized base. Matching only on exact
+ # equality would misread those as never-renamed and record the
+ # standardized+suffix name as the "original" — a false positive.
+ already_standardized = (
+ current_no_ext == standardized_no_ext
+ or current_no_ext.startswith(standardized_no_ext + "-")
+ )
+ if already_standardized:
+ # Already renamed — the original upload name is gone.
+ _logger.debug(
+ f"Skipping {model.__name__} id={artifact.pk} {file_attr}="
+ f"'{current_basename}': already standardized."
+ )
+ num_skipped += 1
+ continue
+
+ # Never renamed: the current on-disk name is the original.
+ if dry_run:
+ _logger.debug(
+ f"[dry-run] Would set {original_attr}='{current_basename}' "
+ f"for {model.__name__} id={artifact.pk} '{artifact.title}'"
+ )
+ else:
+ # Write directly via the queryset so this stays a pure data
+ # backfill — no file-rename / thumbnail side effects from the
+ # model's save().
+ model.objects.filter(pk=artifact.pk).update(
+ **{original_attr: current_basename}
+ )
+ _logger.debug(
+ f"Set {original_attr}='{current_basename}' for "
+ f"{model.__name__} id={artifact.pk} '{artifact.title}'"
+ )
+ num_updated += 1
+
+ return num_updated, num_skipped
diff --git a/website/models/artifact.py b/website/models/artifact.py
index 3eacd138..22913720 100644
--- a/website/models/artifact.py
+++ b/website/models/artifact.py
@@ -41,7 +41,18 @@ class Artifact(models.Model):
raw_file.help_text = "The raw file (e.g., pptx, keynote) for the artifact. While not required, this is "\
"highly recommended as it creates a better archive of the work"
thumbnail = models.ImageField(upload_to=get_upload_thumbnail_dir, editable=False, null=True, max_length=255)
-
+
+ # Provenance: the human-recognizable name of the file as it was uploaded
+ # (e.g., "MyTalk_v3_final.pptx"), before save() renames it to the
+ # standardized Author_Title_VenueYear scheme. Admin-only (editable=False so
+ # it never appears on the public-facing form; surfaced read-only on the
+ # admin change form). Captured only on a genuine new upload — see save().
+ # Existing rows whose file was already renamed can't be recovered and stay
+ # null (the backfill_original_filenames command fills the never-renamed
+ # ones whose on-disk name is still the original). See issue #1391.
+ original_pdf_filename = models.CharField(max_length=255, blank=True, null=True, editable=False)
+ original_raw_filename = models.CharField(max_length=255, blank=True, null=True, editable=False)
+
# Project and keyword associations
projects = models.ManyToManyField('Project', blank=True)
projects.help_text = "Most artifacts are associated with only one project but "\
@@ -233,7 +244,35 @@ def save(self, *args, **kwargs):
first_time_saved = self.id is None
_logger.debug(f"For artifact.id={self.id}, first_time_saved={first_time_saved}")
-
+
+ # --- #1391: snapshot the original uploaded filename(s) ---
+ # The rename logic further down destroys the human-recognizable upload
+ # name (e.g. "MyTalk_v3_final.pptx"). We capture it here, but ONLY on a
+ # genuine new upload — never on a later edit or the m2m-triggered rename
+ # pass, where the file already carries the standardized name. A new
+ # upload is detectable two ways:
+ # 1. the first save of this artifact (the file is the just-uploaded
+ # one), or
+ # 2. an edit where the form reported the file field as changed, i.e.
+ # it is in the incoming update_fields BEFORE the rename block below
+ # appends to that list.
+ # On an edit we must also add the original_* field to update_fields so
+ # it persists (the first save writes all fields anyway).
+ incoming_update_fields = kwargs.get('update_fields')
+ for file_attr, original_attr in (('pdf_file', 'original_pdf_filename'),
+ ('raw_file', 'original_raw_filename')):
+ file_field = getattr(self, file_attr)
+ if not file_field:
+ continue
+ is_new_upload = first_time_saved or (
+ incoming_update_fields is not None and file_attr in incoming_update_fields
+ )
+ if is_new_upload:
+ setattr(self, original_attr, os.path.basename(file_field.name))
+ _logger.debug(f"Captured {original_attr}={getattr(self, original_attr)} for artifact.id={self.id}")
+ if not first_time_saved:
+ kwargs.setdefault('update_fields', []).append(original_attr)
+
# Note that "update_fields" is custom filled by our save_model in ArtifactAdmin
# It will never contain the m2m fields (e.g., authors, keywords, etc.) due to
# how Django handles m2m fields. Instead, you can hook up an m2m_changed signal
diff --git a/website/tests/test_artifact.py b/website/tests/test_artifact.py
index 7b2b2ef2..dac88ea5 100644
--- a/website/tests/test_artifact.py
+++ b/website/tests/test_artifact.py
@@ -1,12 +1,24 @@
"""Tests for Artifact model methods (filename-drift check, raw-file label)."""
+import os
from datetime import date
from unittest.mock import MagicMock, patch
+from django.core.files.uploadedfile import SimpleUploadedFile
+from django.core.management import call_command
from django.test import SimpleTestCase
-from website.models import Publication
+from website.models import Artifact, Publication, Talk
from website.tests.base import DatabaseTestCase
+from website.tests.factories import TalkFactory
+
+
+def _pdf(name):
+ return SimpleUploadedFile(name, b"%PDF-1.4 test", content_type="application/pdf")
+
+
+def _raw(name):
+ return SimpleUploadedFile(name, b"PKstub", content_type="application/octet-stream")
# --- Artifact filename check regression -----------------------------------
@@ -143,3 +155,171 @@ def test_resaving_artifact_without_pdf_does_not_crash(self):
pub.location = "Seattle, WA"
pub.save() # must not raise
self.assertFalse(bool(Publication.objects.get(pk=pub.pk).pdf_file))
+
+
+# --- #1391: original uploaded filename capture ----------------------------
+
+
+class OriginalFilenameCaptureTests(DatabaseTestCase):
+ """
+ Artifact.save() captures the human-recognizable upload name into
+ ``original_pdf_filename`` / ``original_raw_filename`` before the
+ standardized rename destroys it, but only on a *genuine new upload*
+ (issue #1391).
+ """
+
+ def test_new_upload_captures_original_and_survives_rename(self):
+ """
+ On the initial upload the original names are snapshotted, and they
+ survive the ``authors_changed`` rename pass that renames the files on
+ disk to the standardized Author_Title_VenueYear scheme.
+ """
+ person = self.make_person(last_name="Zhang")
+ talk = TalkFactory(
+ title="My Cool Talk",
+ forum_name="CHI",
+ date=date(2024, 1, 1),
+ pdf_file=_pdf("MyTalk_v3_final.pdf"),
+ raw_file=_raw("MyTalk_v3_final.pptx"),
+ authors=[person],
+ )
+ talk.refresh_from_db()
+
+ # The original upload names are preserved...
+ self.assertEqual(talk.original_pdf_filename, "MyTalk_v3_final.pdf")
+ self.assertEqual(talk.original_raw_filename, "MyTalk_v3_final.pptx")
+ # ...while the files on disk were renamed to the standardized scheme.
+ self.assertIn("Zhang", os.path.basename(talk.pdf_file.name))
+ self.assertNotIn("MyTalk_v3_final", os.path.basename(talk.pdf_file.name))
+
+ def test_replacing_file_on_edit_updates_original(self):
+ """
+ Uploading a replacement file through an edit (the admin passes the
+ changed field in ``update_fields``) re-captures the new upload name.
+ """
+ person = self.make_person(last_name="Zhang")
+ talk = TalkFactory(
+ title="My Cool Talk", forum_name="CHI", date=date(2024, 1, 1),
+ pdf_file=_pdf("MyTalk_v3_final.pdf"), authors=[person],
+ )
+
+ talk.pdf_file = _pdf("Revised_v2_FINAL.pdf")
+ talk.save(update_fields=["pdf_file"])
+ talk.refresh_from_db()
+
+ self.assertEqual(talk.original_pdf_filename, "Revised_v2_FINAL.pdf")
+
+ def test_metadata_only_edit_does_not_clobber_original(self):
+ """
+ A later edit that does not touch the file (only metadata) must NOT
+ overwrite the stored original with the now-standardized filename.
+ """
+ person = self.make_person(last_name="Zhang")
+ talk = TalkFactory(
+ title="My Cool Talk", forum_name="CHI", date=date(2024, 1, 1),
+ pdf_file=_pdf("MyTalk_v3_final.pdf"), authors=[person],
+ )
+
+ talk.location = "Honolulu, HI"
+ talk.save(update_fields=["location"])
+ talk.refresh_from_db()
+
+ self.assertEqual(talk.original_pdf_filename, "MyTalk_v3_final.pdf")
+
+
+class BackfillOriginalFilenamesTests(DatabaseTestCase):
+ """
+ The backfill_original_filenames command recovers the original upload name
+ for never-renamed artifacts (whose on-disk filename still IS the original),
+ leaves already-standardized rows blank, and never overwrites a value that
+ is already set (issue #1391).
+ """
+
+ def test_backfills_never_renamed_leaves_standardized_and_is_idempotent(self):
+ # (a) Legacy never-renamed talk: created without authors, so save()
+ # leaves the upload filename untouched. Then null the captured original
+ # to simulate a row predating this feature.
+ legacy = TalkFactory(
+ title="Legacy Talk", forum_name="UIST", date=date(2020, 1, 1),
+ pdf_file=_pdf("OldUpload_final.pdf"),
+ )
+ legacy_name = os.path.basename(legacy.pdf_file.name)
+ Talk.objects.filter(pk=legacy.pk).update(original_pdf_filename=None)
+
+ # (b) Standardized talk: created with an author, so save() renamed the
+ # file to the Author_Title_VenueYear scheme. Null its original too.
+ standard = TalkFactory(
+ title="Standard Talk", forum_name="CHI", date=date(2021, 1, 1),
+ pdf_file=_pdf("whatever_upload.pdf"),
+ authors=[self.make_person(last_name="Lee")],
+ )
+ Talk.objects.filter(pk=standard.pk).update(original_pdf_filename=None)
+
+ call_command("backfill_original_filenames")
+
+ legacy.refresh_from_db()
+ standard.refresh_from_db()
+ # Never-renamed file: its current basename is recorded as the original.
+ self.assertEqual(legacy.original_pdf_filename, legacy_name)
+ # Already-standardized file: original is unrecoverable, left blank.
+ self.assertIsNone(standard.original_pdf_filename)
+
+ # Idempotent: a value already set is never overwritten (only nulls fill).
+ Talk.objects.filter(pk=legacy.pk).update(
+ original_pdf_filename="ManuallyCorrected.pdf"
+ )
+ call_command("backfill_original_filenames")
+ legacy.refresh_from_db()
+ self.assertEqual(legacy.original_pdf_filename, "ManuallyCorrected.pdf")
+
+ def test_uniquified_standardized_name_is_not_backfilled(self):
+ """
+ When a standardized filename collided on disk, the rename appended a
+ "-" suffix (ensure_filename_is_unique). Such a name is still
+ a renamed file, NOT an original upload, so the backfill must leave it
+ blank rather than recording the standardized+suffix name.
+ """
+ person = self.make_person(last_name="Park")
+ talk = TalkFactory(
+ title="Unique Talk", forum_name="CHI", date=date(2022, 1, 1),
+ pdf_file=_pdf("anything.pdf"), authors=[person],
+ )
+ standardized = Artifact.generate_filename(talk)
+ # Simulate the uniquified on-disk name and a not-yet-backfilled row.
+ uniquified = f"talks/{standardized}-1782399772.42.pdf"
+ Talk.objects.filter(pk=talk.pk).update(
+ pdf_file=uniquified, original_pdf_filename=None
+ )
+
+ call_command("backfill_original_filenames")
+
+ talk.refresh_from_db()
+ self.assertIsNone(talk.original_pdf_filename)
+
+
+class OriginalUploadFilenamesDisplayTests(SimpleTestCase):
+ """ArtifactAdmin.original_upload_filenames read-only display (issue #1391)."""
+
+ def _render(self, pdf=None, raw=None):
+ from website.admin.artifact_admin import ArtifactAdmin
+ obj = MagicMock()
+ obj.original_pdf_filename = pdf
+ obj.original_raw_filename = raw
+ # The method does not use self; pass None.
+ return str(ArtifactAdmin.original_upload_filenames(None, obj))
+
+ def test_shows_both_filenames(self):
+ html = self._render(pdf="MyTalk_v3.pdf", raw="MyTalk_v3.pptx")
+ self.assertIn("PDF", html)
+ self.assertIn("MyTalk_v3.pdf", html)
+ self.assertIn("Raw file", html)
+ self.assertIn("MyTalk_v3.pptx", html)
+
+ def test_pdf_only_omits_raw_row(self):
+ html = self._render(pdf="MyTalk_v3.pdf", raw=None)
+ self.assertIn("MyTalk_v3.pdf", html)
+ self.assertNotIn("Raw file", html)
+
+ def test_placeholder_when_nothing_recorded(self):
+ html = self._render(pdf=None, raw=None)
+ self.assertIn("Not recorded", html)