mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-05-29 15:46:49 +02:00
fix(ZIM): preserve co-existing Wikipedia corpora on cleanup (#884)
onWikipediaDownloadComplete was deleting every file whose name starts with `wikipedia_en_`, treating distinct corpora (simple, medicine, wikivoyage, climate_change, etc.) as competing versions of the same selection slot. Whichever wiki finished second silently wiped the other from disk. Match by filename stem instead — strip the trailing `_YYYY-MM(-DD).zim` date suffix and only delete files with the same stem as the new download. Different release dates of the same variant still get cleaned up; distinct variants are preserved. Extracted the predicate to `app/utils/zim_filename.ts` so the boundary is covered by unit tests (8 cases incl. the #884 repro scenario).
This commit is contained in:
parent
d621761412
commit
5193f74410
|
|
@ -7,6 +7,7 @@ import axios from 'axios'
|
|||
import * as cheerio from 'cheerio'
|
||||
import { XMLParser } from 'fast-xml-parser'
|
||||
import { isRawListRemoteZimFilesResponse, isRawRemoteZimFileEntry } from '../../util/zim.js'
|
||||
import { findReplacedWikipediaFiles } from '../utils/zim_filename.js'
|
||||
import logger from '@adonisjs/core/services/logger'
|
||||
import { DockerService } from './docker_service.js'
|
||||
import { inject } from '@adonisjs/core'
|
||||
|
|
@ -627,18 +628,21 @@ export class ZimService {
|
|||
|
||||
logger.info(`[ZimService] Wikipedia download completed successfully: ${filename}`)
|
||||
|
||||
// Delete old Wikipedia files (keep only the newly installed one)
|
||||
// Delete prior versions of THIS specific Wikipedia variant only.
|
||||
// Earlier logic deleted anything starting with `wikipedia_en_`, which silently
|
||||
// wiped distinct corpora the user had installed independently (issue #884).
|
||||
const existingFiles = await this.list()
|
||||
const wikipediaFiles = existingFiles.files.filter((f) =>
|
||||
f.name.startsWith('wikipedia_en_') && f.name !== filename
|
||||
const wikipediaFiles = findReplacedWikipediaFiles(
|
||||
filename,
|
||||
existingFiles.files.map((f) => f.name)
|
||||
)
|
||||
|
||||
for (const oldFile of wikipediaFiles) {
|
||||
try {
|
||||
await this.delete(oldFile.name)
|
||||
logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile.name}`)
|
||||
await this.delete(oldFile)
|
||||
logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile}`)
|
||||
} catch (error) {
|
||||
logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile.name}`, error)
|
||||
logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile}`, error)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
|
|||
26
admin/app/utils/zim_filename.ts
Normal file
26
admin/app/utils/zim_filename.ts
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
/**
 * Trailing Kiwix-style release-date suffix: `_YYYY-MM.zim` or `_YYYY-MM-DD.zim`.
 * Anchored to the end of the name; the `i` flag lets the extension match in any
 * letter case. Hoisted to module scope so it is not rebuilt on every call.
 */
const ZIM_DATE_SUFFIX = /_\d{4}-\d{2}(?:-\d{2})?\.zim$/i

/**
 * Strip the trailing `_YYYY-MM(-DD).zim` date suffix from a Kiwix-style ZIM
 * filename so different release dates of the same variant share a stem
 * (e.g., `wikipedia_en_all_nopic`) while distinct corpora keep distinct stems
 * (`wikipedia_en_simple_all_nopic`, `wikipedia_en_medicine_nopic`, etc.).
 *
 * @param name - ZIM filename to reduce to its stem.
 * @returns The name with the date suffix and `.zim` extension removed. When the
 *   name does not end in a date suffix it is returned unchanged, extension
 *   included.
 */
export function zimFilenameStem(name: string): string {
  return name.replace(ZIM_DATE_SUFFIX, '')
}
|
||||
|
||||
/**
 * Of the existing files, return only those that are prior-version replacements
 * of `currentFilename` — same Wikipedia variant stem, different release. Used
 * by the post-download cleanup to avoid deleting unrelated Wikipedia corpora
 * the user has installed independently (issue #884).
 *
 * @param currentFilename - Name of the file that was just downloaded; it is
 *   never included in the result.
 * @param existingNames - Names of the files currently present. The array is
 *   only read, never modified.
 * @returns A new array, in input order, of the names that start with
 *   `wikipedia_en_`, differ from `currentFilename`, and have the same stem
 *   (per `zimFilenameStem`) as `currentFilename`.
 */
export function findReplacedWikipediaFiles(
  currentFilename: string,
  existingNames: readonly string[]
): string[] {
  // Computed once up front; every candidate is compared against this stem.
  const currentStem = zimFilenameStem(currentFilename)
  return existingNames.filter(
    (name) =>
      name.startsWith('wikipedia_en_') &&
      name !== currentFilename &&
      zimFilenameStem(name) === currentStem
  )
}
|
||||
73
admin/tests/unit/zim_filename.spec.ts
Normal file
73
admin/tests/unit/zim_filename.spec.ts
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
import * as assert from 'node:assert/strict'
|
||||
import { test } from 'node:test'
|
||||
|
||||
import { findReplacedWikipediaFiles, zimFilenameStem } from '../../app/utils/zim_filename.js'
|
||||
|
||||
// --- zimFilenameStem -------------------------------------------------------

test('zimFilenameStem strips YYYY-MM date suffix', () => {
  assert.equal(zimFilenameStem('wikipedia_en_all_nopic_2026-02.zim'), 'wikipedia_en_all_nopic')
})

test('zimFilenameStem strips YYYY-MM-DD date suffix', () => {
  assert.equal(zimFilenameStem('wikipedia_en_all_nopic_2026-02-15.zim'), 'wikipedia_en_all_nopic')
})

test('zimFilenameStem returns input unchanged when no date suffix present', () => {
  assert.equal(
    zimFilenameStem('wikipedia_en_my_custom_extract.zim'),
    'wikipedia_en_my_custom_extract.zim'
  )
})

// --- findReplacedWikipediaFiles --------------------------------------------

test('findReplacedWikipediaFiles cleans up older version of same variant', () => {
  assert.deepEqual(
    findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [
      'wikipedia_en_all_nopic_2026-02.zim',
      'wikipedia_en_all_nopic_2026-04.zim',
    ]),
    ['wikipedia_en_all_nopic_2026-02.zim']
  )
})

// Both date forms reduce to the same stem, so every earlier release of the
// variant is reported, in the order it was listed.
test('findReplacedWikipediaFiles returns every prior release regardless of date granularity', () => {
  assert.deepEqual(
    findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [
      'wikipedia_en_all_nopic_2025-12-15.zim',
      'wikipedia_en_all_nopic_2026-02.zim',
      'wikipedia_en_all_nopic_2026-04.zim',
    ]),
    ['wikipedia_en_all_nopic_2025-12-15.zim', 'wikipedia_en_all_nopic_2026-02.zim']
  )
})

test('findReplacedWikipediaFiles preserves co-existing distinct corpora — the #884 regression case', () => {
  assert.deepEqual(
    findReplacedWikipediaFiles('wikipedia_en_medicine_nopic_2026-04.zim', [
      'wikipedia_en_simple_all_nopic_2026-02.zim',
      'wikipedia_en_medicine_nopic_2026-04.zim',
    ]),
    []
  )
})

test('findReplacedWikipediaFiles preserves all unrelated variants when a new variant lands', () => {
  assert.deepEqual(
    findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [
      'wikipedia_en_simple_all_nopic_2026-02.zim',
      'wikipedia_en_medicine_nopic_2026-04.zim',
      'wikipedia_en_wikivoyage_2026-02.zim',
      'wikipedia_en_climate_change_2025-08.zim',
      'wikipedia_en_all_nopic_2026-04.zim',
    ]),
    []
  )
})

test('findReplacedWikipediaFiles ignores files without wikipedia_en_ prefix', () => {
  assert.deepEqual(
    findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [
      'wiktionary_en_all_2026-02.zim',
      'gutenberg_en_all_2026-01.zim',
      'wikipedia_en_all_nopic_2026-04.zim',
    ]),
    []
  )
})

test('findReplacedWikipediaFiles preserves manually-named files without a date suffix', () => {
  assert.deepEqual(
    findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', [
      'wikipedia_en_my_custom_extract.zim',
      'wikipedia_en_all_nopic_2026-04.zim',
    ]),
    []
  )
})

test('findReplacedWikipediaFiles returns an empty list when no files exist', () => {
  assert.deepEqual(findReplacedWikipediaFiles('wikipedia_en_all_nopic_2026-04.zim', []), [])
})
|
||||
Loading…
Reference in New Issue
Block a user