From 5193f74410f21e98cccaa1dd302a1db0766bca8f Mon Sep 17 00:00:00 2001 From: Chris Sherwood Date: Thu, 14 May 2026 09:53:56 -0700 Subject: [PATCH] fix(ZIM): preserve co-existing Wikipedia corpora on cleanup (#884) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit onWikipediaDownloadComplete was deleting every file whose name starts with `wikipedia_en_`, treating distinct corpora (simple, medicine, wikivoyage, climate_change, etc.) as competing versions of the same selection slot. Whichever wiki finished second silently wiped the other from disk. Match by filename stem instead — strip the trailing `_YYYY-MM(-DD).zim` date suffix and only delete files with the same stem as the new download. Different release dates of the same variant still get cleaned up; distinct variants are preserved. Extracted the predicate to `app/utils/zim_filename.ts` so the boundary is covered by unit tests (8 cases incl. the #884 repro scenario). --- admin/app/services/zim_service.ts | 16 +++--- admin/app/utils/zim_filename.ts | 26 ++++++++++ admin/tests/unit/zim_filename.spec.ts | 73 +++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 6 deletions(-) create mode 100644 admin/app/utils/zim_filename.ts create mode 100644 admin/tests/unit/zim_filename.spec.ts diff --git a/admin/app/services/zim_service.ts b/admin/app/services/zim_service.ts index 538d59f..4c8d430 100644 --- a/admin/app/services/zim_service.ts +++ b/admin/app/services/zim_service.ts @@ -7,6 +7,7 @@ import axios from 'axios' import * as cheerio from 'cheerio' import { XMLParser } from 'fast-xml-parser' import { isRawListRemoteZimFilesResponse, isRawRemoteZimFileEntry } from '../../util/zim.js' +import { findReplacedWikipediaFiles } from '../utils/zim_filename.js' import logger from '@adonisjs/core/services/logger' import { DockerService } from './docker_service.js' import { inject } from '@adonisjs/core' @@ -627,18 +628,21 @@ export class ZimService { logger.info(`[ZimService] 
Wikipedia download completed successfully: ${filename}`) - // Delete old Wikipedia files (keep only the newly installed one) + // Delete prior versions of THIS specific Wikipedia variant only. + // Earlier logic deleted anything starting with `wikipedia_en_`, which silently + // wiped distinct corpora the user had installed independently (issue #884). const existingFiles = await this.list() - const wikipediaFiles = existingFiles.files.filter((f) => - f.name.startsWith('wikipedia_en_') && f.name !== filename + const wikipediaFiles = findReplacedWikipediaFiles( + filename, + existingFiles.files.map((f) => f.name) ) for (const oldFile of wikipediaFiles) { try { - await this.delete(oldFile.name) - logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile.name}`) + await this.delete(oldFile) + logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile}`) } catch (error) { - logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile.name}`, error) + logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile}`, error) } } } else { diff --git a/admin/app/utils/zim_filename.ts b/admin/app/utils/zim_filename.ts new file mode 100644 index 0000000..a2fb5b2 --- /dev/null +++ b/admin/app/utils/zim_filename.ts @@ -0,0 +1,26 @@ +/** + * Strip the trailing `_YYYY-MM(-DD).zim` date suffix from a Kiwix-style ZIM + * filename so different release dates of the same variant share a stem + * (e.g., `wikipedia_en_all_nopic`) while distinct corpora keep distinct stems + * (`wikipedia_en_simple_all_nopic`, `wikipedia_en_medicine_nopic`, etc.). A name that does not end in such a suffix is returned unchanged, `.zim` extension included; the `.zim` extension is matched case-insensitively. + */ +export function zimFilenameStem(name: string): string { + return name.replace(/_\d{4}-\d{2}(?:-\d{2})?\.zim$/i, '') +} + +/** + * Of the existing files, return only those that are prior-version replacements + * of `currentFilename` — same Wikipedia variant stem, different release.
Used + * by the post-download cleanup to avoid deleting unrelated Wikipedia corpora + * the user has installed independently (issue #884). Only names that start with `wikipedia_en_` are candidates, and `currentFilename` itself is never returned. + */ +export function findReplacedWikipediaFiles( + currentFilename: string, + existingNames: string[] +): string[] { + const currentStem = zimFilenameStem(currentFilename) + return existingNames.filter( + (n) => + n.startsWith('wikipedia_en_') && n !== currentFilename && zimFilenameStem(n) === currentStem + ) +} diff --git a/admin/tests/unit/zim_filename.spec.ts b/admin/tests/unit/zim_filename.spec.ts new file mode 100644 index 0000000..4bc4733 --- /dev/null +++ b/admin/tests/unit/zim_filename.spec.ts @@ -0,0 +1,73 @@ +import * as assert from 'node:assert/strict' +import { test } from 'node:test' + +import { findReplacedWikipediaFiles, zimFilenameStem } from '../../app/utils/zim_filename.js' + +test('zimFilenameStem strips YYYY-MM date suffix', () => { + assert.equal(zimFilenameStem('wikipedia_en_all_nopic_2026-02.zim'), 'wikipedia_en_all_nopic') +}) + +test('zimFilenameStem strips YYYY-MM-DD date suffix', () => { + assert.equal(zimFilenameStem('wikipedia_en_all_nopic_2026-02-15.zim'), 'wikipedia_en_all_nopic') +}) + +test('zimFilenameStem returns input unchanged when no date suffix present', () => { + assert.equal( + zimFilenameStem('wikipedia_en_my_custom_extract.zim'), + 'wikipedia_en_my_custom_extract.zim' + ) +}) + +test('findReplacedWikipediaFiles cleans up older version of same variant', () => { + assert.deepEqual( + findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [ + 'wikipedia_en_all_nopic_2026-02.zim', + 'wikipedia_en_all_nopic_2026-04.zim', + ]), + ['wikipedia_en_all_nopic_2026-02.zim'] + ) +}) + +test('findReplacedWikipediaFiles preserves co-existing distinct corpora — the #884 regression case', () => { + assert.deepEqual( + findReplacedWikipediaFiles('wikipedia_en_medicine_nopic_2026-04.zim', [ + 'wikipedia_en_simple_all_nopic_2026-02.zim', + 'wikipedia_en_medicine_nopic_2026-04.zim', + ]), + [] +
) +}) + +test('findReplacedWikipediaFiles preserves all unrelated variants when a new variant lands', () => { + assert.deepEqual( + findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [ + 'wikipedia_en_simple_all_nopic_2026-02.zim', + 'wikipedia_en_medicine_nopic_2026-04.zim', + 'wikipedia_en_wikivoyage_2026-02.zim', + 'wikipedia_en_climate_change_2025-08.zim', + 'wikipedia_en_all_nopic_2026-04.zim', + ]), + [] + ) +}) + +test('findReplacedWikipediaFiles ignores files without wikipedia_en_ prefix', () => { + assert.deepEqual( + findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [ + 'wiktionary_en_all_2026-02.zim', + 'gutenberg_en_all_2026-01.zim', + 'wikipedia_en_all_nopic_2026-04.zim', + ]), + [] + ) +}) + +test('findReplacedWikipediaFiles preserves manually-named files without a date suffix', () => { + assert.deepEqual( + findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [ + 'wikipedia_en_my_custom_extract.zim', + 'wikipedia_en_all_nopic_2026-04.zim', + ]), + [] + ) +})