Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ echo "4.7 Running 'python manage.py backfill_project_visibility' to resolve is_v
echo "******************************************"
python manage.py backfill_project_visibility

# Step 4.7b: recover the original upload filename for artifacts whose file was
# never renamed to the standardized scheme (#1391). Per the command's own help
# text it only fills empty values and is idempotent, which is why it is run
# unconditionally on every container start.
echo "****************** STEP 4.7b/5: docker-entrypoint.sh ************************"
echo "4.7b Running 'python manage.py backfill_original_filenames' to recover original upload filenames for never-renamed artifacts (#1391)"
echo "******************************************"
python manage.py backfill_original_filenames

echo "****************** STEP 4.8/5: docker-entrypoint.sh ************************"
echo "4.8 Running 'python manage.py recompute_url_names' to de-collide historical url_names (#1206)"
echo "******************************************"
Expand Down
55 changes: 45 additions & 10 deletions website/admin/artifact_admin.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from django.contrib import admin
from website.models import Artifact
from django.contrib.admin import widgets
from django.utils.html import format_html
from django.utils.html import format_html, format_html_join
from sortedm2m_filter_horizontal_widget.forms import SortedFilteredSelectMultiple
from website.utils.upload_validators import PDF_EXTENSIONS, RAW_FILE_EXTENSIONS
from easy_thumbnails.files import get_thumbnailer
Expand Down Expand Up @@ -41,9 +41,10 @@ class Media:
# (Django auto-applies DISTINCT for the M2M join). Subclasses may extend this.
search_fields = ['title', 'forum_name', 'authors__first_name', 'authors__last_name']

# thumbnail_preview is a computed, read-only display (see below). It must be
# listed here so Django allows it in get_fieldsets() on the change form.
readonly_fields = ('thumbnail_preview',)
# thumbnail_preview and original_upload_filenames are computed, read-only
# displays (see below). They must be listed here so Django allows them in
# get_fieldsets() on the change form.
readonly_fields = ('thumbnail_preview', 'original_upload_filenames')

fieldsets = [
(None, {'fields': ['title', 'authors', 'date']}),
Expand Down Expand Up @@ -97,12 +98,45 @@ def thumbnail_preview(self, obj):
# Django auto-appends the trailing colon in the admin label.
thumbnail_preview.short_description = 'PDF thumbnail'

def original_upload_filenames(self, obj):
    """
    Render the name(s) an artifact's file(s) had when they were uploaded.

    Read-only provenance breadcrumb for the admin change form showing the
    human-recognizable upload name(s) (e.g. "MyTalk_v3_final.pptx") from
    before ``Artifact.save()`` renamed the file(s) to the standardized
    scheme. Admin-only; sourced from the ``original_pdf_filename`` /
    ``original_raw_filename`` model fields (issue #1391).

    Args:
        obj: The artifact being displayed, or ``None`` when there is no
            saved object yet.

    Returns:
        Safe HTML: one ``<div>`` per recorded filename ("PDF" first, then
        "Raw file"), or a muted placeholder ``<span>`` when ``obj`` is
        ``None`` or neither name is recorded. Historical rows whose file
        was already renamed can't be recovered (the original name is gone),
        and PDF-less / raw-file-less artifacts simply have nothing to show.
    """
    # The visible text is passed as a format argument rather than baked into
    # the template: calling format_html() with no args/kwargs is deprecated
    # since Django 5.0 and raises TypeError in Django 6.0. The rendered HTML
    # is unchanged because the text contains nothing that needs escaping.
    muted_span = '<span style="color:#666;">{}</span>'

    if obj is None:
        return format_html(muted_span, '—')

    rows = []
    if obj.original_pdf_filename:
        rows.append(('PDF', obj.original_pdf_filename))
    if obj.original_raw_filename:
        rows.append(('Raw file', obj.original_raw_filename))

    if not rows:
        return format_html(
            muted_span,
            'Not recorded — uploaded before this was tracked, or no file attached.',
        )

    # format_html_join() escapes each label/filename, so an upload name
    # containing HTML metacharacters is rendered as inert text.
    return format_html_join(
        '', '<div><strong>{}:</strong> {}</div>', rows
    )

# Django auto-appends the trailing colon in the admin label.
original_upload_filenames.short_description = 'Originally uploaded as'

def get_fieldsets(self, request, obj=None):
"""
Inject the read-only ``thumbnail_preview`` into the 'Files' fieldset on
the change form only. Done here (rather than in each child admin's
``fieldsets``) so Publication / Talk / Poster all get the preview.
On the Add form there is no saved thumbnail yet, so it is omitted.
Inject the read-only ``thumbnail_preview`` and ``original_upload_filenames``
displays into the 'Files' fieldset on the change form only. Done here
(rather than in each child admin's ``fieldsets``) so Publication / Talk /
Poster all get them. On the Add form there is no saved thumbnail or
captured upload name yet, so both are omitted.
"""
fieldsets = super().get_fieldsets(request, obj)
if obj is None:
Expand All @@ -113,8 +147,9 @@ def get_fieldsets(self, request, obj=None):
for name, opts in fieldsets:
if name == 'Files':
fields = list(opts.get('fields', []))
if 'thumbnail_preview' not in fields:
fields = fields + ['thumbnail_preview']
for extra in ('thumbnail_preview', 'original_upload_filenames'):
if extra not in fields:
fields = fields + [extra]
opts = {**opts, 'fields': fields}
updated.append((name, opts))
return updated
Expand Down
131 changes: 131 additions & 0 deletions website/management/commands/backfill_original_filenames.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os
import logging

from django.core.management.base import BaseCommand

from website.models import Artifact, Talk, Poster, Publication

# This retrieves a Python logging instance (or creates it)
_logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Recover original upload filenames for never-renamed artifact files.

    For every Talk, Poster and Publication that has a file but no captured
    original filename, compare the file's on-disk name with the standardized
    name produced by ``Artifact.generate_filename()``. When they differ, the
    on-disk name is taken to be the original upload name and is stored; when
    they match, the row is left blank because the original name is gone.
    """

    help = (
        "Backfills Artifact.original_pdf_filename / original_raw_filename for "
        "existing talks, posters, and publications whose file was never renamed "
        "to the standardized scheme (issue #1391). Production has many such rows "
        "predating the auto-rename feature, so their on-disk filename still IS "
        "the original upload name and can be recovered. Rows whose file already "
        "matches the standardized scheme have lost their original name and are "
        "left blank. Only fills empty values, so it never overwrites a name "
        "already captured at upload time. Idempotent: safe to run on every "
        "container start."
    )

    # Concrete artifact models whose files are covered by this backfill.
    MODELS = (Talk, Poster, Publication)

    # Pairs of (FileField attribute, original-name attribute) to backfill.
    FILE_FIELDS = (
        ('pdf_file', 'original_pdf_filename'),
        ('raw_file', 'original_raw_filename'),
    )

    def add_arguments(self, parser):
        """Register the optional ``--dry-run`` flag on the argument parser."""
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Report what would change without writing to the database.",
        )

    def handle(self, *args, **options):
        """Run the backfill over every model in ``MODELS`` and log a summary."""
        is_dry_run = options["dry_run"]
        _logger.debug(
            f"Running backfill_original_filenames.py (dry_run={is_dry_run}) to "
            f"recover original upload filenames for never-renamed artifacts."
        )

        # Each entry is the (num_updated, num_skipped) pair for one model.
        per_model_counts = [
            self._backfill_model(model, is_dry_run) for model in self.MODELS
        ]
        total_updated = sum(updated for updated, _ in per_model_counts)
        total_skipped = sum(skipped for _, skipped in per_model_counts)

        if is_dry_run:
            verb = "Would update"
        else:
            verb = "Updated"
        _logger.info(
            f"backfill_original_filenames: {verb} {total_updated} filename "
            f"field(s); skipped {total_skipped} (already standardized — original "
            f"name unrecoverable)."
        )
        _logger.debug("Completed backfill_original_filenames.py")

    @staticmethod
    def _rows_missing_original(model, file_attr, original_attr):
        """Return the rows of ``model`` with a file but no captured original name.

        Authors are prefetched because generate_filename() reads the first
        author's last name; this avoids a per-row query.
        """
        missing_original = model.objects.filter(
            **{f"{original_attr}__isnull": True}
        )
        with_file = missing_original.exclude(**{file_attr: ""}).exclude(
            **{f"{file_attr}__isnull": True}
        )
        return with_file.prefetch_related("authors")

    @staticmethod
    def _is_already_standardized(current_stem, standardized_stem):
        """Tell whether an on-disk name is the standardized name or a variant.

        A file counts as already standardized when its extension-less name
        equals the standardized scheme OR is a uniquified variant of it. When
        a standardized name collides on disk, ensure_filename_is_unique()
        (fileutils.py) appends "-<timestamp>" — e.g.
        "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name still
        STARTS WITH the standardized base. Matching on exact equality alone
        would misread those as never-renamed and record the
        standardized+suffix name as the "original" — a false positive.
        """
        if current_stem == standardized_stem:
            return True
        return current_stem.startswith(standardized_stem + "-")

    def _backfill_model(self, model, dry_run):
        """Backfill both file fields for one concrete artifact model.

        Returns a ``(num_updated, num_skipped)`` tuple: the number of
        individual filename fields filled (or that would be filled on a dry
        run) and the number deliberately left blank.
        """
        filled = 0
        left_blank = 0

        for file_attr, original_attr in self.FILE_FIELDS:
            pending = self._rows_missing_original(model, file_attr, original_attr)

            for artifact in pending:
                stored_file = getattr(artifact, file_attr)
                if not stored_file:
                    continue

                on_disk_name = os.path.basename(stored_file.name)
                on_disk_stem, _extension = os.path.splitext(on_disk_name)
                standardized_stem = Artifact.generate_filename(artifact)

                if self._is_already_standardized(on_disk_stem, standardized_stem):
                    # Already renamed — the original upload name is gone.
                    _logger.debug(
                        f"Skipping {model.__name__} id={artifact.pk} {file_attr}="
                        f"'{on_disk_name}': already standardized."
                    )
                    left_blank += 1
                    continue

                # Never renamed: the current on-disk name is the original.
                if not dry_run:
                    # Write through the queryset so this stays a pure data
                    # backfill — no file-rename / thumbnail side effects from
                    # the model's save().
                    model.objects.filter(pk=artifact.pk).update(
                        **{original_attr: on_disk_name}
                    )
                    _logger.debug(
                        f"Set {original_attr}='{on_disk_name}' for "
                        f"{model.__name__} id={artifact.pk} '{artifact.title}'"
                    )
                else:
                    _logger.debug(
                        f"[dry-run] Would set {original_attr}='{on_disk_name}' "
                        f"for {model.__name__} id={artifact.pk} '{artifact.title}'"
                    )
                filled += 1

        return filled, left_blank
43 changes: 41 additions & 2 deletions website/models/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,18 @@ class Artifact(models.Model):
raw_file.help_text = "The raw file (e.g., pptx, keynote) for the artifact. While not required, this is "\
"<b>highly</b> recommended as it creates a better archive of the work"
thumbnail = models.ImageField(upload_to=get_upload_thumbnail_dir, editable=False, null=True, max_length=255)


# Provenance: the human-recognizable name of the file as it was uploaded
# (e.g., "MyTalk_v3_final.pptx"), before save() renames it to the
# standardized Author_Title_VenueYear scheme. Admin-only (editable=False so
# it never appears on the public-facing form; surfaced read-only on the
# admin change form). Captured only on a genuine new upload — see save().
# Existing rows whose file was already renamed can't be recovered and stay
# null (the backfill_original_filenames command fills the never-renamed
# ones whose on-disk name is still the original). See issue #1391.
original_pdf_filename = models.CharField(max_length=255, blank=True, null=True, editable=False)
original_raw_filename = models.CharField(max_length=255, blank=True, null=True, editable=False)

# Project and keyword associations
projects = models.ManyToManyField('Project', blank=True)
projects.help_text = "Most artifacts are associated with only one project but "\
Expand Down Expand Up @@ -233,7 +244,35 @@ def save(self, *args, **kwargs):

first_time_saved = self.id is None
_logger.debug(f"For artifact.id={self.id}, first_time_saved={first_time_saved}")


# --- #1391: snapshot the original uploaded filename(s) ---
# The rename logic further down destroys the human-recognizable upload
# name (e.g. "MyTalk_v3_final.pptx"). We capture it here, but ONLY on a
# genuine new upload — never on a later edit or the m2m-triggered rename
# pass, where the file already carries the standardized name. A new
# upload is detectable two ways:
# 1. the first save of this artifact (the file is the just-uploaded
# one), or
# 2. an edit where the form reported the file field as changed, i.e.
# it is in the incoming update_fields BEFORE the rename block below
# appends to that list.
# On an edit we must also add the original_* field to update_fields so
# it persists (the first save writes all fields anyway).
incoming_update_fields = kwargs.get('update_fields')
for file_attr, original_attr in (('pdf_file', 'original_pdf_filename'),
('raw_file', 'original_raw_filename')):
file_field = getattr(self, file_attr)
if not file_field:
continue
is_new_upload = first_time_saved or (
incoming_update_fields is not None and file_attr in incoming_update_fields
)
if is_new_upload:
setattr(self, original_attr, os.path.basename(file_field.name))
_logger.debug(f"Captured {original_attr}={getattr(self, original_attr)} for artifact.id={self.id}")
if not first_time_saved:
kwargs.setdefault('update_fields', []).append(original_attr)

# Note that "update_fields" is custom filled by our save_model in ArtifactAdmin
# It will never contain the m2m fields (e.g., authors, keywords, etc.) due to
# how Django handles m2m fields. Instead, you can hook up an m2m_changed signal
Expand Down
Loading
Loading