fix(ZIM): preserve co-existing Wikipedia corpora on cleanup (#884)

onWikipediaDownloadComplete was deleting every file whose name starts
with `wikipedia_en_`, treating distinct corpora (simple, medicine,
wikivoyage, climate_change, etc.) as competing versions of the same
selection slot. Whichever wiki finished second silently wiped the
other from disk.

Match by filename stem instead — strip the trailing `_YYYY-MM(-DD).zim`
date suffix and only delete files with the same stem as the new
download. Different release dates of the same variant still get cleaned
up; distinct variants are preserved.

Extracted the predicate to `app/utils/zim_filename.ts` so the boundary
is covered by unit tests (8 cases incl. the #884 repro scenario).
This commit is contained in:
Chris Sherwood 2026-05-14 09:53:56 -07:00 committed by Jake Turner
parent d621761412
commit 5193f74410
3 changed files with 109 additions and 6 deletions

View File

@ -7,6 +7,7 @@ import axios from 'axios'
import * as cheerio from 'cheerio'
import { XMLParser } from 'fast-xml-parser'
import { isRawListRemoteZimFilesResponse, isRawRemoteZimFileEntry } from '../../util/zim.js'
import { findReplacedWikipediaFiles } from '../utils/zim_filename.js'
import logger from '@adonisjs/core/services/logger'
import { DockerService } from './docker_service.js'
import { inject } from '@adonisjs/core'
@ -627,18 +628,21 @@ export class ZimService {
logger.info(`[ZimService] Wikipedia download completed successfully: ${filename}`)
// Delete old Wikipedia files (keep only the newly installed one)
// Delete prior versions of THIS specific Wikipedia variant only.
// Earlier logic deleted anything starting with `wikipedia_en_`, which silently
// wiped distinct corpora the user had installed independently (issue #884).
const existingFiles = await this.list()
const wikipediaFiles = existingFiles.files.filter((f) =>
f.name.startsWith('wikipedia_en_') && f.name !== filename
const wikipediaFiles = findReplacedWikipediaFiles(
filename,
existingFiles.files.map((f) => f.name)
)
for (const oldFile of wikipediaFiles) {
try {
await this.delete(oldFile.name)
logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile.name}`)
await this.delete(oldFile)
logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile}`)
} catch (error) {
logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile.name}`, error)
logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile}`, error)
}
}
} else {

View File

@ -0,0 +1,26 @@
/**
 * Compute the release-independent stem of a Kiwix-style ZIM filename.
 *
 * A trailing `_YYYY-MM.zim` or `_YYYY-MM-DD.zim` suffix (extension matched
 * case-insensitively) is dropped, so two releases of one variant map to the
 * same stem, e.g. `wikipedia_en_all_nopic_2026-02.zim` and
 * `wikipedia_en_all_nopic_2026-04.zim` both yield `wikipedia_en_all_nopic`.
 *
 * @param name - ZIM filename to reduce.
 * @returns The name up to (not including) the date suffix. A name that does
 *   not end in such a suffix is returned unchanged, `.zim` extension included.
 */
export function zimFilenameStem(name: string): string {
  // The pattern is anchored to the end of the name, so a match always runs to
  // the last character; cutting at the match index removes exactly the suffix.
  const dateSuffix = /_\d{4}-\d{2}(?:-\d{2})?\.zim$/i.exec(name)
  if (dateSuffix === null) {
    return name
  }
  return name.slice(0, dateSuffix.index)
}
/**
 * Pick out the existing files that are other releases of the same Wikipedia
 * variant as `currentFilename`.
 *
 * A name qualifies only when it starts with `wikipedia_en_`, is not
 * `currentFilename` itself, and has the same date-stripped stem (see
 * `zimFilenameStem`). Post-download cleanup uses this so that unrelated
 * Wikipedia corpora the user installed independently are left alone
 * (issue #884).
 *
 * @param currentFilename - Filename of the download that just completed.
 * @param existingNames - Filenames currently present on disk.
 * @returns The qualifying names, in the order they appear in `existingNames`.
 */
export function findReplacedWikipediaFiles(
  currentFilename: string,
  existingNames: string[]
): string[] {
  const targetStem = zimFilenameStem(currentFilename)

  const isOtherReleaseOfSameVariant = (candidate: string): boolean => {
    if (!candidate.startsWith('wikipedia_en_')) {
      return false
    }
    // Never report the freshly downloaded file as something to replace.
    if (candidate === currentFilename) {
      return false
    }
    return zimFilenameStem(candidate) === targetStem
  }

  return existingNames.filter(isOtherReleaseOfSameVariant)
}

View File

@ -0,0 +1,73 @@
import * as assert from 'node:assert/strict'
import { test } from 'node:test'
import { findReplacedWikipediaFiles, zimFilenameStem } from '../../app/utils/zim_filename.js'
// zimFilenameStem: one row per case — test title, input filename, expected stem.
const stemCases: ReadonlyArray<{ title: string; input: string; expected: string }> = [
  {
    title: 'zimFilenameStem strips YYYY-MM date suffix',
    input: 'wikipedia_en_all_nopic_2026-02.zim',
    expected: 'wikipedia_en_all_nopic',
  },
  {
    title: 'zimFilenameStem strips YYYY-MM-DD date suffix',
    input: 'wikipedia_en_all_nopic_2026-02-15.zim',
    expected: 'wikipedia_en_all_nopic',
  },
  {
    // No date suffix: the whole name, extension included, is the stem.
    title: 'zimFilenameStem returns input unchanged when no date suffix present',
    input: 'wikipedia_en_my_custom_extract.zim',
    expected: 'wikipedia_en_my_custom_extract.zim',
  },
]

for (const { title, input, expected } of stemCases) {
  test(title, () => {
    assert.equal(zimFilenameStem(input), expected)
  })
}
// findReplacedWikipediaFiles: one row per case — test title, the filename that
// just finished downloading, the names on disk, and the names expected back.
const replacementCases: ReadonlyArray<{
  title: string
  current: string
  existing: string[]
  expected: string[]
}> = [
  {
    title: 'findReplacedWikipediaFiles cleans up older version of same variant',
    current: 'wikipedia_en_all_nopic_2026-04.zim',
    existing: ['wikipedia_en_all_nopic_2026-02.zim', 'wikipedia_en_all_nopic_2026-04.zim'],
    expected: ['wikipedia_en_all_nopic_2026-02.zim'],
  },
  {
    title:
      'findReplacedWikipediaFiles preserves co-existing distinct corpora — the #884 regression case',
    current: 'wikipedia_en_medicine_nopic_2026-04.zim',
    existing: [
      'wikipedia_en_simple_all_nopic_2026-02.zim',
      'wikipedia_en_medicine_nopic_2026-04.zim',
    ],
    expected: [],
  },
  {
    title: 'findReplacedWikipediaFiles preserves all unrelated variants when a new variant lands',
    current: 'wikipedia_en_all_nopic_2026-04.zim',
    existing: [
      'wikipedia_en_simple_all_nopic_2026-02.zim',
      'wikipedia_en_medicine_nopic_2026-04.zim',
      'wikipedia_en_wikivoyage_2026-02.zim',
      'wikipedia_en_climate_change_2025-08.zim',
      'wikipedia_en_all_nopic_2026-04.zim',
    ],
    expected: [],
  },
  {
    title: 'findReplacedWikipediaFiles ignores files without wikipedia_en_ prefix',
    current: 'wikipedia_en_all_nopic_2026-04.zim',
    existing: [
      'wiktionary_en_all_2026-02.zim',
      'gutenberg_en_all_2026-01.zim',
      'wikipedia_en_all_nopic_2026-04.zim',
    ],
    expected: [],
  },
  {
    title: 'findReplacedWikipediaFiles preserves manually-named files without a date suffix',
    current: 'wikipedia_en_all_nopic_2026-04.zim',
    existing: ['wikipedia_en_my_custom_extract.zim', 'wikipedia_en_all_nopic_2026-04.zim'],
    expected: [],
  },
]

for (const { title, current, existing, expected } of replacementCases) {
  test(title, () => {
    assert.deepEqual(findReplacedWikipediaFiles(current, existing), expected)
  })
}