project-nomad/admin/app/services/zim_service.ts
Chris Sherwood 38304d9012 fix(content): recognize Wikipedia downloads from mirror sources
When Wikipedia is downloaded via a custom mirror instead of the default
Kiwix server, the completion callback now matches by filename instead
of exact URL. This ensures the Wikipedia selector correctly shows
"Installed" status and triggers old-version cleanup regardless of
which mirror was used.

Also handles the case where no Wikipedia selection exists yet (file
downloaded before visiting the selector), creating the record
automatically.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 09:50:43 -07:00

717 lines
23 KiB
TypeScript

import {
ListRemoteZimFilesResponse,
RawRemoteZimFileEntry,
RemoteZimFileEntry,
} from '../../types/zim.js'
import axios from 'axios'
import { XMLParser } from 'fast-xml-parser'
import { isRawListRemoteZimFilesResponse, isRawRemoteZimFileEntry } from '../../util/zim.js'
import logger from '@adonisjs/core/services/logger'
import { DockerService } from './docker_service.js'
import { inject } from '@adonisjs/core'
import {
deleteFileIfExists,
ensureDirectoryExists,
getFileStatsIfExists,
listDirectoryContents,
ZIM_STORAGE_PATH,
} from '../utils/fs.js'
import { join, resolve, sep } from 'path'
import { WikipediaOption, WikipediaState } from '../../types/downloads.js'
import vine from '@vinejs/vine'
import { wikipediaOptionsFileSchema } from '#validators/curated_collections'
import WikipediaSelection from '#models/wikipedia_selection'
import InstalledResource from '#models/installed_resource'
import { RunDownloadJob } from '#jobs/run_download_job'
import { SERVICE_NAMES } from '../../constants/service_names.js'
import { CollectionManifestService } from './collection_manifest_service.js'
import type { CategoryWithStatus } from '../../types/collections.js'
import CustomLibrarySource from '#models/custom_library_source'
import { assertNotPrivateUrl } from '#validators/common'
// MIME types accepted for ZIM downloads; passed as `allowedMimeTypes` to every download job this service dispatches.
const ZIM_MIME_TYPES = ['application/x-zim', 'application/x-openzim', 'application/octet-stream']
// Remote JSON document listing the selectable Wikipedia options; fetched and validated by getWikipediaOptions().
const WIKIPEDIA_OPTIONS_URL = 'https://raw.githubusercontent.com/Crosstalk-Solutions/project-nomad/refs/heads/main/collections/wikipedia.json'
@inject()
export class ZimService {
// The injected DockerService is used to restart the Kiwix container after ZIM files are added or removed.
constructor(private dockerService: DockerService) { }
/**
 * Lists the ZIM files found in the local ZIM storage directory.
 * The directory is passed through `ensureDirectoryExists` before it is read.
 *
 * @returns An object whose `files` array contains every directory entry whose name ends with `.zim`.
 */
async list() {
  const storageDir = join(process.cwd(), ZIM_STORAGE_PATH)
  await ensureDirectoryExists(storageDir)

  const entries = await listDirectoryContents(storageDir)
  const zimEntries = entries.filter(({ name }) => name.endsWith('.zim'))
  return { files: zimEntries }
}
/**
 * Fetches one page of the remote Kiwix catalog and returns the entries that are not
 * already present in local storage.
 *
 * @param start - Offset of the first catalog entry to request (sent as the `start` query parameter).
 * @param count - Number of catalog entries to request (sent as the `count` query parameter).
 * @param query - Optional search text, forwarded as the `q` query parameter when non-empty.
 * @returns `items` (downloadable entries minus files that already exist locally),
 *          `total_count` (the feed's `totalResults`) and `has_more` (whether entries remain
 *          beyond this page).
 * @throws Error when the parsed response does not pass `isRawListRemoteZimFilesResponse`.
 */
async listRemote({
  start,
  count,
  query,
}: {
  start: number
  count: number
  query?: string
}): Promise<ListRemoteZimFilesResponse> {
  const LIBRARY_BASE_URL = 'https://browse.library.kiwix.org/catalog/v2/entries'
  // Catalog download links normally end with this suffix; the real file URL is the link without it.
  const META4_SUFFIX = '.meta4'

  const res = await axios.get(LIBRARY_BASE_URL, {
    params: {
      start: start,
      count: count,
      lang: 'eng',
      ...(query ? { q: query } : {}),
    },
    responseType: 'text',
  })
  const data = res.data

  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: '',
    textNodeName: '#text',
  })
  const result = parser.parse(data)
  if (!isRawListRemoteZimFilesResponse(result)) {
    throw new Error('Invalid response format from remote library')
  }

  // The XML parser yields a single object (not an array) when the feed has exactly one entry.
  const entries = result.feed.entry
    ? Array.isArray(result.feed.entry)
      ? result.feed.entry
      : [result.feed.entry]
    : []

  const filtered = entries.filter((entry: any) => {
    return isRawRemoteZimFileEntry(entry)
  })

  const mapped: (RemoteZimFileEntry | null)[] = filtered.map((entry: RawRemoteZimFileEntry) => {
    const downloadLink = entry.link.find((link: any) => {
      return (
        typeof link === 'object' &&
        'rel' in link &&
        'length' in link &&
        'href' in link &&
        'type' in link &&
        link.type === 'application/x-zim'
      )
    })
    if (!downloadLink) {
      return null
    }

    // Only strip the suffix when it is actually there; blindly cutting six characters
    // would corrupt a link that already points straight at the .zim file.
    const href = downloadLink['href']
    const download_url = href.endsWith(META4_SUFFIX)
      ? href.substring(0, href.length - META4_SUFFIX.length)
      : href
    const file_name = download_url.split('/').pop() || `${entry.title}.zim`
    const sizeBytes = parseInt(downloadLink['length'], 10)

    return {
      id: entry.id,
      title: entry.title,
      updated: entry.updated,
      summary: entry.summary,
      // parseInt yields NaN for a missing/garbled length; report 0 in that case.
      size_bytes: sizeBytes || 0,
      download_url: download_url,
      author: entry.author.name,
      file_name: file_name,
    }
  })

  // Filter out any null entries (those without a valid download link)
  // or files that already exist in the local storage
  const existing = await this.list()
  const existingKeys = new Set(existing.files.map((file) => file.name))
  const withoutExisting = mapped.filter(
    (entry): entry is RemoteZimFileEntry => entry !== null && !existingKeys.has(entry.file_name)
  )

  return {
    items: withoutExisting,
    // More pages exist only if the catalog holds entries beyond the ones this page returned.
    // The raw entry count is used (not `items.length`) because local filtering shrinks `items`.
    has_more: result.feed.totalResults > start + entries.length,
    total_count: result.feed.totalResults,
  }
}
/**
 * Queues a background download of a single ZIM file into local ZIM storage.
 *
 * @param url - Absolute URL of the ZIM file; its path must end with `.zim`.
 * @returns The filename the download will be stored under and the id of the dispatched job.
 * @throws TypeError when `url` cannot be parsed by the `URL` constructor.
 * @throws Error when the URL path does not end with `.zim`, a download for the same URL is
 *         already registered, no usable filename can be derived, or the job fails to dispatch.
 */
async downloadRemote(url: string): Promise<{ filename: string; jobId?: string }> {
  const parsed = new URL(url)
  if (!parsed.pathname.endsWith('.zim')) {
    throw new Error(`Invalid ZIM file URL: ${url}. URL must end with .zim`)
  }

  const existing = await RunDownloadJob.getByUrl(url)
  if (existing) {
    throw new Error('A download for this URL is already in progress')
  }

  // Extract the filename from the URL. The query string and fragment are dropped first:
  // the `.zim` check above only looks at the path, so without this a URL such as
  // `.../file.zim?token=x` would be stored as `file.zim?token=x`, and a fragment
  // containing `/` could even produce a segment like `..`.
  const urlWithoutQuery = url.split(/[?#]/, 1)[0] ?? url
  const filename = urlWithoutQuery.split('/').pop()
  if (!filename || !filename.endsWith('.zim')) {
    throw new Error('Could not determine filename from URL')
  }

  const filepath = join(process.cwd(), ZIM_STORAGE_PATH, filename)

  // Parse resource metadata for the download job
  const parsedFilename = CollectionManifestService.parseZimFilename(filename)
  const resourceMetadata = parsedFilename
    ? { resource_id: parsedFilename.resource_id, version: parsedFilename.version, collection_ref: null }
    : undefined

  // Dispatch a background download job
  const result = await RunDownloadJob.dispatch({
    url,
    filepath,
    timeout: 30000,
    allowedMimeTypes: ZIM_MIME_TYPES,
    forceNew: true,
    filetype: 'zim',
    resourceMetadata,
  })
  if (!result || !result.job) {
    throw new Error('Failed to dispatch download job')
  }

  logger.info(`[ZimService] Dispatched background download job for ZIM file: ${filename}`)
  return {
    filename,
    jobId: result.job.id,
  }
}
/**
 * Returns the curated ZIM categories with their status, as produced by
 * `CollectionManifestService.getCategoriesWithStatus()`.
 */
async listCuratedCategories(): Promise<CategoryWithStatus[]> {
  return new CollectionManifestService().getCategoriesWithStatus()
}
/**
 * Queues downloads for every resource of a curated category tier that is not installed yet.
 *
 * @param categorySlug - Slug of the category inside the `zim_categories` spec.
 * @param tierSlug - Slug of the tier inside that category.
 * @returns The filenames that were queued, or `null` when nothing needed to be queued.
 * @throws Error when the spec cannot be loaded or the category/tier slug is unknown.
 */
async downloadCategoryTier(categorySlug: string, tierSlug: string): Promise<string[] | null> {
  const manifests = new CollectionManifestService()
  const spec = await manifests.getSpecWithFallback<import('../../types/collections.js').ZimCategoriesSpec>('zim_categories')
  if (!spec) {
    throw new Error('Could not load ZIM categories spec')
  }

  const category = spec.categories.find(({ slug }) => slug === categorySlug)
  if (!category) {
    throw new Error(`Category not found: ${categorySlug}`)
  }

  const tier = category.tiers.find(({ slug }) => slug === tierSlug)
  if (!tier) {
    throw new Error(`Tier not found: ${tierSlug}`)
  }

  const tierResources = CollectionManifestService.resolveTierResources(tier, category.tiers)

  // Drop everything that already has an InstalledResource row of type "zim".
  const installedRows = await InstalledResource.query().where('resource_type', 'zim')
  const installedIds = new Set(installedRows.map((row) => row.resource_id))
  const pending = tierResources.filter((candidate) => !installedIds.has(candidate.id))
  if (pending.length === 0) return null

  const queuedFilenames: string[] = []
  for (const resource of pending) {
    // A job for the same URL is already registered; do not queue it twice.
    if (await RunDownloadJob.getByUrl(resource.url)) {
      logger.warn(`[ZimService] Download already in progress for ${resource.url}, skipping.`)
      continue
    }

    const filename = resource.url.split('/').pop()
    if (!filename) continue
    queuedFilenames.push(filename)

    await RunDownloadJob.dispatch({
      url: resource.url,
      filepath: join(process.cwd(), ZIM_STORAGE_PATH, filename),
      timeout: 30000,
      allowedMimeTypes: ZIM_MIME_TYPES,
      forceNew: true,
      filetype: 'zim',
      resourceMetadata: {
        resource_id: resource.id,
        version: resource.version,
        collection_ref: categorySlug,
      },
    })
  }

  return queuedFilenames.length > 0 ? queuedFilenames : null
}
/**
 * Post-processing for successfully downloaded ZIM files.
 *
 * For each URL whose filename starts with `wikipedia_en_`, the Wikipedia selection is updated
 * via `onWikipediaDownloadComplete`. Optionally restarts the Kiwix container (only when no other
 * ZIM download is still queued or running), then records an InstalledResource row for every
 * non-Wikipedia file whose name can be parsed by `CollectionManifestService.parseZimFilename`.
 *
 * @param urls - Source URLs of the files that finished downloading.
 * @param restart - When true (default), restart Kiwix once no ZIM downloads remain pending.
 */
async downloadRemoteSuccessCallback(urls: string[], restart = true) {
  // Classify by FILENAME, not by the whole URL: a mirror whose path merely contains
  // "wikipedia_en_" must not cause an unrelated file to be handled as Wikipedia, because
  // the Wikipedia handler deletes every other `wikipedia_en_*` file on disk.
  const isWikipediaDownload = (downloadUrl: string): boolean =>
    (downloadUrl.split('/').pop() ?? '').startsWith('wikipedia_en_')

  // Check if any URL is a Wikipedia download and handle it
  for (const url of urls) {
    if (isWikipediaDownload(url)) {
      await this.onWikipediaDownloadComplete(url, true)
    }
  }

  if (restart) {
    // Check if there are any remaining ZIM download jobs before restarting
    const { QueueService } = await import('./queue_service.js')
    const queueService = new QueueService()
    const queue = queueService.getQueue('downloads')

    // Get all active and waiting jobs
    const [activeJobs, waitingJobs] = await Promise.all([
      queue.getActive(),
      queue.getWaiting(),
    ])

    // Filter out completed jobs (progress === 100) to avoid race condition
    // where this job itself is still in the active queue
    const activeIncompleteJobs = activeJobs.filter((job) => {
      const progress = typeof job.progress === 'number' ? job.progress : 0
      return progress < 100
    })

    // Check if any remaining incomplete jobs are ZIM downloads
    const allJobs = [...activeIncompleteJobs, ...waitingJobs]
    const hasRemainingZimJobs = allJobs.some((job) => job.data.filetype === 'zim')

    if (hasRemainingZimJobs) {
      logger.info('[ZimService] Skipping container restart - more ZIM downloads pending')
    } else {
      // Restart KIWIX container to pick up new ZIM file
      logger.info('[ZimService] No more ZIM downloads pending - restarting KIWIX container')
      await this.dockerService
        .affectContainer(SERVICE_NAMES.KIWIX, 'restart')
        .catch((error) => {
          logger.error(`[ZimService] Failed to restart KIWIX container:`, error) // Don't stop the download completion, just log the error.
        })
    }
  }

  // Create InstalledResource entries for downloaded files
  for (const url of urls) {
    // Skip Wikipedia files (managed separately)
    if (isWikipediaDownload(url)) continue

    const filename = url.split('/').pop()
    if (!filename) continue

    const parsed = CollectionManifestService.parseZimFilename(filename)
    if (!parsed) continue

    const filepath = join(process.cwd(), ZIM_STORAGE_PATH, filename)
    const stats = await getFileStatsIfExists(filepath)

    try {
      const { DateTime } = await import('luxon')
      await InstalledResource.updateOrCreate(
        { resource_id: parsed.resource_id, resource_type: 'zim' },
        {
          version: parsed.version,
          url: url,
          file_path: filepath,
          file_size_bytes: stats ? Number(stats.size) : null,
          installed_at: DateTime.now(),
        }
      )
      logger.info(`[ZimService] Created InstalledResource entry for: ${parsed.resource_id}`)
    } catch (error) {
      // Bookkeeping failure for one file must not prevent the others from being recorded.
      logger.error(`[ZimService] Failed to create InstalledResource for ${filename}:`, error)
    }
  }
}
/**
 * Deletes a ZIM file from local storage together with its InstalledResource row.
 *
 * @param file - File name inside the ZIM storage directory; `.zim` is appended when missing.
 * @throws Error('Invalid filename') when the resolved path falls outside the storage directory.
 * @throws Error('not_found') when no file exists at the resolved path.
 */
async delete(file: string): Promise<void> {
  const fileName = file.endsWith('.zim') ? file : file + '.zim'

  const storageRoot = resolve(join(process.cwd(), ZIM_STORAGE_PATH))
  const targetPath = resolve(join(storageRoot, fileName))

  // Path-traversal guard: after resolution the target must still live under the storage root.
  const isInsideStorage = targetPath.startsWith(storageRoot + sep)
  if (!isInsideStorage) {
    throw new Error('Invalid filename')
  }

  const stats = await getFileStatsIfExists(targetPath)
  if (!stats) {
    throw new Error('not_found')
  }
  await deleteFileIfExists(targetPath)

  // Remove the matching InstalledResource row, if the name maps to a known resource id.
  const parsed = CollectionManifestService.parseZimFilename(fileName)
  if (!parsed) return

  await InstalledResource.query()
    .where('resource_id', parsed.resource_id)
    .where('resource_type', 'zim')
    .delete()
  logger.info(`[ZimService] Deleted InstalledResource entry for: ${parsed.resource_id}`)
}
// Wikipedia selector methods
/**
 * Downloads the Wikipedia options document and validates it against `wikipediaOptionsFileSchema`.
 *
 * @returns The `options` array of the validated document.
 * @throws Error('Failed to fetch Wikipedia options') on any fetch or validation failure
 *         (the underlying error is logged).
 */
async getWikipediaOptions(): Promise<WikipediaOption[]> {
  try {
    const { data } = await axios.get(WIKIPEDIA_OPTIONS_URL)
    const { options } = await vine.validate({
      schema: wikipediaOptionsFileSchema,
      data,
    })
    return options
  } catch (error) {
    logger.error(`[ZimService] Failed to fetch Wikipedia options:`, error)
    throw new Error('Failed to fetch Wikipedia options')
  }
}
/**
 * Returns the stored Wikipedia selection, or `null` when none has been saved yet.
 * The table is expected to hold at most one row, so the first row is the selection.
 */
async getWikipediaSelection(): Promise<WikipediaSelection | null> {
  const onlyRow = await WikipediaSelection.query().first()
  return onlyRow
}
/**
 * Combines the available Wikipedia options with the currently stored selection.
 *
 * @returns `options` from `getWikipediaOptions()` and `currentSelection`, which is `null`
 *          when no selection row exists.
 */
async getWikipediaState(): Promise<WikipediaState> {
  const options = await this.getWikipediaOptions()
  const stored = await this.getWikipediaSelection()

  if (!stored) {
    return { options, currentSelection: null }
  }

  return {
    options,
    currentSelection: {
      optionId: stored.option_id,
      status: stored.status,
      filename: stored.filename,
      url: stored.url,
    },
  }
}
/**
 * Switches the installed Wikipedia edition.
 *
 * - Selecting the option that is already installed is a no-op.
 * - Selecting `'none'` deletes the current Wikipedia file, stores a `'none'` selection and
 *   restarts Kiwix.
 * - Any other option stores a `'downloading'` selection and dispatches a download job. If the
 *   job cannot be dispatched, the selection is restored to what it was before the call.
 *
 * @param optionId - Id of one of the options returned by `getWikipediaOptions()`.
 * @returns `success` plus a human-readable `message`; `jobId` is set when a download was started.
 *          `success` is false when a download for the option's URL is already in progress.
 * @throws Error when the option id is unknown, the option has no URL, no filename can be derived
 *         from the URL, or the download job cannot be dispatched.
 */
async selectWikipedia(optionId: string): Promise<{ success: boolean; jobId?: string; message?: string }> {
  const options = await this.getWikipediaOptions()
  const selectedOption = options.find((opt) => opt.id === optionId)
  if (!selectedOption) {
    throw new Error(`Invalid Wikipedia option: ${optionId}`)
  }

  const currentSelection = await this.getWikipediaSelection()

  // If same as currently installed, no action needed
  if (currentSelection?.option_id === optionId && currentSelection.status === 'installed') {
    return { success: true, message: 'Already installed' }
  }

  // Handle "none" option - delete current Wikipedia file and update DB
  if (optionId === 'none') {
    if (currentSelection?.filename) {
      try {
        await this.delete(currentSelection.filename)
        logger.info(`[ZimService] Deleted Wikipedia file: ${currentSelection.filename}`)
      } catch {
        // File might already be deleted, that's OK
        logger.warn(`[ZimService] Could not delete Wikipedia file (may already be gone): ${currentSelection.filename}`)
      }
    }

    // Update or create the selection record (always use first record)
    if (currentSelection) {
      currentSelection.option_id = 'none'
      currentSelection.url = null
      currentSelection.filename = null
      currentSelection.status = 'none'
      await currentSelection.save()
    } else {
      await WikipediaSelection.create({
        option_id: 'none',
        url: null,
        filename: null,
        status: 'none',
      })
    }

    // Restart Kiwix to reflect the change
    await this.dockerService
      .affectContainer(SERVICE_NAMES.KIWIX, 'restart')
      .catch((error) => {
        logger.error(`[ZimService] Failed to restart Kiwix after Wikipedia removal:`, error)
      })

    return { success: true, message: 'Wikipedia removed' }
  }

  // Start download for the new Wikipedia option
  if (!selectedOption.url) {
    throw new Error('Selected Wikipedia option has no download URL')
  }

  // Check if already downloading
  const existingJob = await RunDownloadJob.getByUrl(selectedOption.url)
  if (existingJob) {
    return { success: false, message: 'Download already in progress' }
  }

  // Extract filename from URL
  const filename = selectedOption.url.split('/').pop()
  if (!filename) {
    throw new Error('Could not determine filename from URL')
  }
  const filepath = join(process.cwd(), ZIM_STORAGE_PATH, filename)

  // Snapshot the existing selection BEFORE mutating it. `selection` below is the very same
  // object as `currentSelection`, so reading the "old" values from it after the update
  // would just read back the new ones and the revert would do nothing.
  const previous = currentSelection
    ? {
        option_id: currentSelection.option_id,
        url: currentSelection.url,
        filename: currentSelection.filename,
        status: currentSelection.status,
      }
    : null

  // Update or create selection record to show downloading status
  let selection: WikipediaSelection
  if (currentSelection) {
    currentSelection.option_id = optionId
    currentSelection.url = selectedOption.url
    currentSelection.filename = filename
    currentSelection.status = 'downloading'
    await currentSelection.save()
    selection = currentSelection
  } else {
    selection = await WikipediaSelection.create({
      option_id: optionId,
      url: selectedOption.url,
      filename: filename,
      status: 'downloading',
    })
  }

  try {
    // Dispatch download job
    const result = await RunDownloadJob.dispatch({
      url: selectedOption.url,
      filepath,
      timeout: 30000,
      allowedMimeTypes: ZIM_MIME_TYPES,
      forceNew: true,
      filetype: 'zim',
    })
    if (!result || !result.job) {
      throw new Error('Failed to dispatch download job')
    }

    logger.info(`[ZimService] Started Wikipedia download for ${optionId}: ${filename}`)
    return {
      success: true,
      jobId: result.job.id,
      message: 'Download started',
    }
  } catch (error) {
    // Revert status on failure to dispatch (covers both a falsy result and a thrown error),
    // so the selection is not left stuck in "downloading".
    selection.option_id = previous ? previous.option_id : 'none'
    selection.url = previous ? previous.url : null
    selection.filename = previous ? previous.filename : null
    selection.status = previous ? previous.status : 'none'
    await selection.save()
    throw error
  }
}
/**
 * Updates the Wikipedia selection after a Wikipedia download finished.
 *
 * On success the selection row is created/updated to `installed` and every other local
 * `wikipedia_en_*` file is deleted. On failure the selection is marked `failed` when it has
 * no filename or refers to the same filename.
 *
 * The option id is resolved by comparing filenames (not full URLs), so a file fetched from a
 * mirror is still recognised.
 *
 * @param url - URL the file was downloaded from; its last path segment is used as the filename.
 * @param success - Whether the download completed successfully.
 */
async onWikipediaDownloadComplete(url: string, success: boolean): Promise<void> {
  const filename = url.split('/').pop() || ''

  // Without a filename a "successful" completion would record an empty filename and the
  // cleanup below would delete EVERY Wikipedia file (none of them equals ''). Bail out instead.
  if (success && !filename) {
    logger.error(`[ZimService] Wikipedia download reported success but no filename could be derived from: ${url}`)
    return
  }

  const selection = await this.getWikipediaSelection()

  // Determine which Wikipedia option this file belongs to by matching filename
  let matchedOptionId: string | null = null
  try {
    const options = await this.getWikipediaOptions()
    const matchedOption = options.find((opt) => opt.url && opt.url.split('/').pop() === filename)
    matchedOptionId = matchedOption ? matchedOption.id : null
  } catch {
    // If we can't fetch options, try to continue with existing selection
  }

  if (success) {
    // Update or create the selection record
    // Match by filename (not URL) so mirror downloads are recognized
    if (selection) {
      selection.option_id = matchedOptionId || selection.option_id
      selection.url = url
      selection.filename = filename
      selection.status = 'installed'
      await selection.save()
    } else {
      await WikipediaSelection.create({
        option_id: matchedOptionId || 'unknown',
        url: url,
        filename: filename,
        status: 'installed',
      })
    }
    logger.info(`[ZimService] Wikipedia download completed successfully: ${filename}`)

    // Delete old Wikipedia files (keep only the newly installed one)
    const existingFiles = await this.list()
    const wikipediaFiles = existingFiles.files.filter((f) =>
      f.name.startsWith('wikipedia_en_') && f.name !== filename
    )
    for (const oldFile of wikipediaFiles) {
      try {
        await this.delete(oldFile.name)
        logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile.name}`)
      } catch (error) {
        // Best effort: a leftover old file must not fail the completion handling.
        logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile.name}`, error)
      }
    }
  } else {
    // Download failed - update selection if it matches this file
    if (selection && (!selection.filename || selection.filename === filename)) {
      selection.status = 'failed'
      await selection.save()
      logger.error(`[ZimService] Wikipedia download failed for: ${filename}`)
    } else {
      logger.error(`[ZimService] Wikipedia download failed for: ${filename} (no matching selection)`)
    }
  }
}
// Custom library source management
/** Returns every stored custom library source. */
async listCustomLibraries(): Promise<CustomLibrarySource[]> {
  const sources = await CustomLibrarySource.all()
  return sources
}
/**
 * Stores a new custom library source.
 *
 * @param name - Display name of the library.
 * @param baseUrl - Base URL of the library; a trailing `/` is appended when missing.
 * @returns The created record.
 * @throws Error when 10 or more custom libraries already exist.
 */
async addCustomLibrary(name: string, baseUrl: string): Promise<CustomLibrarySource> {
  const countRows = await CustomLibrarySource.query().count('* as total')
  const existingTotal = Number(countRows[0].$extras.total)
  if (existingTotal >= 10) {
    throw new Error('Maximum of 10 custom libraries allowed')
  }

  // Ensure URL ends with /
  let base_url = baseUrl
  if (!base_url.endsWith('/')) {
    base_url = base_url + '/'
  }

  return CustomLibrarySource.create({ name, base_url })
}
/**
 * Deletes a custom library source.
 *
 * @param id - Primary key of the source to delete.
 * @throws Error when no source with that id exists, or when the source is flagged `is_default`.
 */
async removeCustomLibrary(id: number): Promise<void> {
  const library = await CustomLibrarySource.find(id)

  if (!library) {
    throw new Error('Custom library not found')
  }
  // Entries flagged as default are built-in mirrors and must stay.
  if (library.is_default) {
    throw new Error('Cannot remove a built-in mirror')
  }

  await library.delete()
}
/**
 * Fetches an HTTP directory listing and extracts its sub-directories and `.zim` files.
 *
 * Only relative links written as `href="..."` are considered; parent/self links, sorting
 * links, anchors, absolute paths and absolute URLs are ignored.
 *
 * @param url - URL of the directory listing; a trailing `/` is appended when missing.
 * @returns `directories` and `files`, each sorted by name. `size_bytes` is `null` when the
 *          size cannot be read from the listing.
 * @throws Whatever `assertNotPrivateUrl` raises for a rejected URL (NOTE(review): presumably
 *         private/internal addresses — confirm against the validator), and any request error
 *         from axios.
 */
async browseLibraryUrl(url: string): Promise<{
  directories: { name: string; url: string }[]
  files: { name: string; url: string; size_bytes: number | null }[]
}> {
  assertNotPrivateUrl(url)

  const normalizedUrl = url.endsWith('/') ? url : url + '/'
  const res = await axios.get(normalizedUrl, {
    responseType: 'text',
    timeout: 15000,
    headers: {
      'Accept': 'text/html',
    },
  })
  const html: string = res.data

  const directories: { name: string; url: string }[] = []
  const files: { name: string; url: string; size_bytes: number | null }[] = []

  // decodeURIComponent throws URIError on a malformed percent-escape (e.g. a literal "%"
  // in a name). One odd link must not abort the whole listing, so fall back to the raw text.
  const decodeName = (value: string): string => {
    try {
      return decodeURIComponent(value)
    } catch {
      return value
    }
  }

  // Parse <a href="..."> links from HTML directory listings
  // Works with Apache, Nginx, and most HTTP directory indexes
  const linkRegex = /<a\s+[^>]*href="([^"]+)"[^>]*>([^<]*)<\/a>/gi
  let match: RegExpExecArray | null
  while ((match = linkRegex.exec(html)) !== null) {
    const href = match[1]

    // Skip parent directory, self, root, sorting links, and in-page anchors
    if (!href || href === '../' || href === './' || href === '/' || href.startsWith('?') || href.startsWith('#')) {
      continue
    }
    // Skip absolute paths (e.g., /mirror/kiwix.org/) and absolute URLs
    if (href.startsWith('/') || href.startsWith('http://') || href.startsWith('https://')) {
      continue
    }

    // Directory (ends with /)
    if (href.endsWith('/')) {
      directories.push({
        name: decodeName(href.replace(/\/$/, '')),
        url: normalizedUrl + href,
      })
      continue
    }

    // ZIM file
    if (href.endsWith('.zim')) {
      files.push({
        name: decodeName(href),
        url: normalizedUrl + href,
        size_bytes: this._extractSizeFromListing(html, href),
      })
    }
  }

  // Sort directories alphabetically, files alphabetically
  directories.sort((a, b) => a.name.localeCompare(b.name))
  files.sort((a, b) => a.name.localeCompare(b.name))

  return { directories, files }
}
/**
 * Try to extract a file's size from an HTML directory listing.
 *
 * Looks for the anchor whose `href` attribute is exactly `href`, followed by a date token,
 * a time token and a size token, as in plain-text style listings:
 *
 *   Apache style: <a href="file.zim">file.zim</a>  2024-01-15 10:30  5.1G
 *   Nginx style:  <a href="file.zim">file.zim</a>  15-Jan-2024 10:30  5368709120
 *
 * @param html - Full HTML of the directory listing.
 * @param href - The link's `href` value exactly as it appears in the HTML.
 * @returns The size in bytes, or `null` when no size can be parsed.
 */
private _extractSizeFromListing(html: string, href: string): number | null {
  // Escape regex metacharacters so the href is matched literally.
  const escapedHref = href.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

  // - Anchored at `href="` so that "foo.zim" cannot match inside "barfoo.zim" and pick up
  //   another file's size.
  // - The date token allows letters ([\w-]+): Nginx prints the month as a name ("15-Jan-2024"),
  //   which a digits-and-dashes-only pattern can never match.
  const sizePattern = new RegExp(
    `href="${escapedHref}"[^<]*</a>\\s+[\\w-]+\\s+[\\d:]+\\s+([\\d.]+[KMGT]?)\\b`,
    'i'
  )

  const sizeMatch = sizePattern.exec(html)
  if (!sizeMatch) return null

  const sizeStr = sizeMatch[1]
  if (!sizeStr) return null
  const num = parseFloat(sizeStr)
  if (isNaN(num)) return null

  // If it's a plain number (Nginx shows raw bytes)
  if (/^\d+$/.test(sizeStr)) return num

  // Apache uses K, M, G, T suffixes (powers of 1024)
  const suffix = sizeStr.slice(-1).toUpperCase()
  const multipliers: Record<string, number> = { K: 1024, M: 1024 ** 2, G: 1024 ** 3, T: 1024 ** 4 }
  return multipliers[suffix] ? Math.round(num * multipliers[suffix]) : null
}
}