project-nomad/admin/app/services/zim_service.ts
Jake Turner 36b6d8ed7a fix: rework content tier system to dynamically determine install status
Removes the InstalledTier model and instead checks presence of files on-the-fly. Avoid broken state by handling on the server-side vs. marking as installed by client-side API call
2026-02-04 22:58:21 -08:00

658 lines
22 KiB
TypeScript

import {
ListRemoteZimFilesResponse,
RawRemoteZimFileEntry,
RemoteZimFileEntry,
} from '../../types/zim.js'
import axios from 'axios'
import { XMLParser } from 'fast-xml-parser'
import { isRawListRemoteZimFilesResponse, isRawRemoteZimFileEntry } from '../../util/zim.js'
import logger from '@adonisjs/core/services/logger'
import { DockerService } from './docker_service.js'
import { inject } from '@adonisjs/core'
import {
deleteFileIfExists,
ensureDirectoryExists,
getFileStatsIfExists,
listDirectoryContents,
ZIM_STORAGE_PATH,
} from '../utils/fs.js'
import { join } from 'path'
import { CuratedCategory, CuratedCollectionWithStatus, CuratedCollectionsFile, WikipediaOption, WikipediaState } from '../../types/downloads.js'
import vine from '@vinejs/vine'
import { curatedCategoriesFileSchema, curatedCollectionsFileSchema, wikipediaOptionsFileSchema } from '#validators/curated_collections'
import CuratedCollection from '#models/curated_collection'
import CuratedCollectionResource from '#models/curated_collection_resource'
import WikipediaSelection from '#models/wikipedia_selection'
import ZimFileMetadata from '#models/zim_file_metadata'
import { RunDownloadJob } from '#jobs/run_download_job'
import { DownloadCollectionOperation, DownloadRemoteSuccessCallback } from '../../types/files.js'
import { SERVICE_NAMES } from '../../constants/service_names.js'
const ZIM_MIME_TYPES = ['application/x-zim', 'application/x-openzim', 'application/octet-stream']
const CATEGORIES_URL = 'https://raw.githubusercontent.com/Crosstalk-Solutions/project-nomad/refs/heads/master/collections/kiwix-categories.json'
const COLLECTIONS_URL =
'https://github.com/Crosstalk-Solutions/project-nomad/raw/refs/heads/master/collections/kiwix.json'
const WIKIPEDIA_OPTIONS_URL = 'https://raw.githubusercontent.com/Crosstalk-Solutions/project-nomad/refs/heads/master/collections/wikipedia.json'
interface IZimService {
downloadCollection: DownloadCollectionOperation
downloadRemoteSuccessCallback: DownloadRemoteSuccessCallback
}
@inject()
export class ZimService implements IZimService {
constructor(private dockerService: DockerService) {}
async list() {
const dirPath = join(process.cwd(), ZIM_STORAGE_PATH)
await ensureDirectoryExists(dirPath)
const all = await listDirectoryContents(dirPath)
const files = all.filter((item) => item.name.endsWith('.zim'))
// Fetch metadata for all files
const metadataRecords = await ZimFileMetadata.all()
const metadataMap = new Map(metadataRecords.map((m) => [m.filename, m]))
// Enrich files with metadata
const enrichedFiles = files.map((file) => {
const metadata = metadataMap.get(file.name)
return {
...file,
title: metadata?.title || null,
summary: metadata?.summary || null,
author: metadata?.author || null,
size_bytes: metadata?.size_bytes || null,
}
})
return {
files: enrichedFiles,
}
}
async listRemote({
start,
count,
query,
}: {
start: number
count: number
query?: string
}): Promise<ListRemoteZimFilesResponse> {
const LIBRARY_BASE_URL = 'https://browse.library.kiwix.org/catalog/v2/entries'
const res = await axios.get(LIBRARY_BASE_URL, {
params: {
start: start,
count: count,
lang: 'eng',
...(query ? { q: query } : {}),
},
responseType: 'text',
})
const data = res.data
const parser = new XMLParser({
ignoreAttributes: false,
attributeNamePrefix: '',
textNodeName: '#text',
})
const result = parser.parse(data)
if (!isRawListRemoteZimFilesResponse(result)) {
throw new Error('Invalid response format from remote library')
}
const entries = result.feed.entry
? Array.isArray(result.feed.entry)
? result.feed.entry
: [result.feed.entry]
: []
const filtered = entries.filter((entry: any) => {
return isRawRemoteZimFileEntry(entry)
})
const mapped: (RemoteZimFileEntry | null)[] = filtered.map((entry: RawRemoteZimFileEntry) => {
const downloadLink = entry.link.find((link: any) => {
return (
typeof link === 'object' &&
'rel' in link &&
'length' in link &&
'href' in link &&
'type' in link &&
link.type === 'application/x-zim'
)
})
if (!downloadLink) {
return null
}
// downloadLink['href'] will end with .meta4, we need to remove that to get the actual download URL
const download_url = downloadLink['href'].substring(0, downloadLink['href'].length - 6)
const file_name = download_url.split('/').pop() || `${entry.title}.zim`
const sizeBytes = parseInt(downloadLink['length'], 10)
return {
id: entry.id,
title: entry.title,
updated: entry.updated,
summary: entry.summary,
size_bytes: sizeBytes || 0,
download_url: download_url,
author: entry.author.name,
file_name: file_name,
}
})
// Filter out any null entries (those without a valid download link)
// or files that already exist in the local storage
const existing = await this.list()
const existingKeys = new Set(existing.files.map((file) => file.name))
const withoutExisting = mapped.filter(
(entry): entry is RemoteZimFileEntry => entry !== null && !existingKeys.has(entry.file_name)
)
return {
items: withoutExisting,
has_more: result.feed.totalResults > start,
total_count: result.feed.totalResults,
}
}
async downloadRemote(
url: string,
metadata?: { title: string; summary?: string; author?: string; size_bytes?: number }
): Promise<{ filename: string; jobId?: string }> {
const parsed = new URL(url)
if (!parsed.pathname.endsWith('.zim')) {
throw new Error(`Invalid ZIM file URL: ${url}. URL must end with .zim`)
}
const existing = await RunDownloadJob.getByUrl(url)
if (existing) {
throw new Error('A download for this URL is already in progress')
}
// Extract the filename from the URL
const filename = url.split('/').pop()
if (!filename) {
throw new Error('Could not determine filename from URL')
}
const filepath = join(process.cwd(), ZIM_STORAGE_PATH, filename)
// Store metadata if provided
if (metadata) {
await ZimFileMetadata.updateOrCreate(
{ filename },
{
title: metadata.title,
summary: metadata.summary || null,
author: metadata.author || null,
size_bytes: metadata.size_bytes || null,
}
)
logger.info(`[ZimService] Stored metadata for ZIM file: ${filename}`)
}
// Dispatch a background download job
const result = await RunDownloadJob.dispatch({
url,
filepath,
timeout: 30000,
allowedMimeTypes: ZIM_MIME_TYPES,
forceNew: true,
filetype: 'zim',
})
if (!result || !result.job) {
throw new Error('Failed to dispatch download job')
}
logger.info(`[ZimService] Dispatched background download job for ZIM file: ${filename}`)
return {
filename,
jobId: result.job.id,
}
}
async downloadCollection(slug: string) {
const collection = await CuratedCollection.query().where('slug', slug).andWhere('type', 'zim').first()
if (!collection) {
return null
}
const resources = await collection.related('resources').query().where('downloaded', false)
if (resources.length === 0) {
return null
}
const downloadUrls = resources.map((res) => res.url)
const downloadFilenames: string[] = []
for (const url of downloadUrls) {
const existing = await RunDownloadJob.getByUrl(url)
if (existing) {
logger.warn(`[ZimService] Download already in progress for URL ${url}, skipping.`)
continue
}
// Extract the filename from the URL
const filename = url.split('/').pop()
if (!filename) {
logger.warn(`[ZimService] Could not determine filename from URL ${url}, skipping.`)
continue
}
downloadFilenames.push(filename)
const filepath = join(process.cwd(), ZIM_STORAGE_PATH, filename)
await RunDownloadJob.dispatch({
url,
filepath,
timeout: 30000,
allowedMimeTypes: ZIM_MIME_TYPES,
forceNew: true,
filetype: 'zim',
})
}
return downloadFilenames.length > 0 ? downloadFilenames : null
}
async downloadRemoteSuccessCallback(urls: string[], restart = true) {
// Check if any URL is a Wikipedia download and handle it
for (const url of urls) {
if (url.includes('wikipedia_en_')) {
await this.onWikipediaDownloadComplete(url, true)
}
}
if (restart) {
// Check if there are any remaining ZIM download jobs before restarting
const { QueueService } = await import('./queue_service.js')
const queueService = new QueueService()
const queue = queueService.getQueue('downloads')
// Get all active and waiting jobs
const [activeJobs, waitingJobs] = await Promise.all([
queue.getActive(),
queue.getWaiting(),
])
// Filter out completed jobs (progress === 100) to avoid race condition
// where this job itself is still in the active queue
const activeIncompleteJobs = activeJobs.filter((job) => {
const progress = typeof job.progress === 'number' ? job.progress : 0
return progress < 100
})
// Check if any remaining incomplete jobs are ZIM downloads
const allJobs = [...activeIncompleteJobs, ...waitingJobs]
const hasRemainingZimJobs = allJobs.some((job) => job.data.filetype === 'zim')
if (hasRemainingZimJobs) {
logger.info('[ZimService] Skipping container restart - more ZIM downloads pending')
} else {
// Restart KIWIX container to pick up new ZIM file
logger.info('[ZimService] No more ZIM downloads pending - restarting KIWIX container')
await this.dockerService
.affectContainer(SERVICE_NAMES.KIWIX, 'restart')
.catch((error) => {
logger.error(`[ZimService] Failed to restart KIWIX container:`, error) // Don't stop the download completion, just log the error.
})
}
}
// Mark any curated collection resources with this download URL as downloaded
const resources = await CuratedCollectionResource.query().whereIn('url', urls)
for (const resource of resources) {
resource.downloaded = true
await resource.save()
}
}
async listCuratedCategories(): Promise<CuratedCategory[]> {
try {
const response = await axios.get(CATEGORIES_URL)
const data = response.data
const validated = await vine.validate({
schema: curatedCategoriesFileSchema,
data,
});
// Dynamically determine installed tier for each category
const categoriesWithStatus = await Promise.all(
validated.categories.map(async (category) => {
const installedTierSlug = await this.getInstalledTierForCategory(category)
return {
...category,
installedTierSlug,
}
})
)
return categoriesWithStatus
} catch (error) {
logger.error(`[ZimService] Failed to fetch curated categories:`, error)
throw new Error('Failed to fetch curated categories or invalid format was received')
}
}
/**
* Dynamically determines which tier is installed for a category by checking
* which tier's resources are all downloaded. Returns the highest tier that
* is fully installed (considering that higher tiers include lower tier resources)
*/
private async getInstalledTierForCategory(category: CuratedCategory): Promise<string | undefined> {
const { files: diskFiles } = await this.list()
const diskFilenames = new Set(diskFiles.map((f) => f.name))
// Get all CuratedCollectionResources marked as downloaded
const downloadedResources = await CuratedCollectionResource.query()
.where('downloaded', true)
.select('url')
const downloadedUrls = new Set(downloadedResources.map((r) => r.url))
// Check each tier from highest to lowest (assuming tiers are ordered from low to high)
// We check in reverse to find the highest fully-installed tier
const reversedTiers = [...category.tiers].reverse()
for (const tier of reversedTiers) {
const allResourcesInstalled = tier.resources.every((resource) => {
// Check if resource is marked as downloaded in database
if (downloadedUrls.has(resource.url)) {
return true
}
// Fallback: check if file exists on disk (for resources not tracked in CuratedCollectionResource)
const filename = resource.url.split('/').pop()
if (filename && diskFilenames.has(filename)) {
return true
}
return false
})
if (allResourcesInstalled && tier.resources.length > 0) {
return tier.slug
}
}
return undefined
}
async listCuratedCollections(): Promise<CuratedCollectionWithStatus[]> {
const collections = await CuratedCollection.query().where('type', 'zim').preload('resources')
return collections.map((collection) => ({
...(collection.serialize() as CuratedCollection),
all_downloaded: collection.resources.every((res) => res.downloaded),
}))
}
async fetchLatestCollections(): Promise<boolean> {
try {
const response = await axios.get<CuratedCollectionsFile>(COLLECTIONS_URL)
const validated = await vine.validate({
schema: curatedCollectionsFileSchema,
data: response.data,
})
for (const collection of validated.collections) {
const { resources, ...restCollection } = collection; // we'll handle resources separately
// Upsert the collection itself
await CuratedCollection.updateOrCreate(
{ slug: restCollection.slug },
{
...restCollection,
type: 'zim',
}
)
logger.info(`[ZimService] Upserted curated collection: ${restCollection.slug}`)
// Upsert collection's resources
const resourcesResult = await CuratedCollectionResource.updateOrCreateMany('url', resources.map((res) => ({
...res,
curated_collection_slug: restCollection.slug, // add the foreign key
})))
logger.info(
`[ZimService] Upserted ${resourcesResult.length} resources for collection: ${restCollection.slug}`
)
}
return true
} catch (error) {
logger.error(`[ZimService] Failed to download latest Kiwix collections:`, error)
return false
}
}
async delete(file: string): Promise<void> {
let fileName = file
if (!fileName.endsWith('.zim')) {
fileName += '.zim'
}
const fullPath = join(process.cwd(), ZIM_STORAGE_PATH, fileName)
const exists = await getFileStatsIfExists(fullPath)
if (!exists) {
throw new Error('not_found')
}
await deleteFileIfExists(fullPath)
// Clean up metadata
await ZimFileMetadata.query().where('filename', fileName).delete()
logger.info(`[ZimService] Deleted metadata for ZIM file: ${fileName}`)
}
// Wikipedia selector methods
async getWikipediaOptions(): Promise<WikipediaOption[]> {
try {
const response = await axios.get(WIKIPEDIA_OPTIONS_URL)
const data = response.data
const validated = await vine.validate({
schema: wikipediaOptionsFileSchema,
data,
})
return validated.options
} catch (error) {
logger.error(`[ZimService] Failed to fetch Wikipedia options:`, error)
throw new Error('Failed to fetch Wikipedia options')
}
}
async getWikipediaSelection(): Promise<WikipediaSelection | null> {
// Get the single row from wikipedia_selections (there should only ever be one)
return WikipediaSelection.query().first()
}
async getWikipediaState(): Promise<WikipediaState> {
const options = await this.getWikipediaOptions()
const selection = await this.getWikipediaSelection()
return {
options,
currentSelection: selection
? {
optionId: selection.option_id,
status: selection.status,
filename: selection.filename,
url: selection.url,
}
: null,
}
}
async selectWikipedia(optionId: string): Promise<{ success: boolean; jobId?: string; message?: string }> {
const options = await this.getWikipediaOptions()
const selectedOption = options.find((opt) => opt.id === optionId)
if (!selectedOption) {
throw new Error(`Invalid Wikipedia option: ${optionId}`)
}
const currentSelection = await this.getWikipediaSelection()
// If same as currently installed, no action needed
if (currentSelection?.option_id === optionId && currentSelection.status === 'installed') {
return { success: true, message: 'Already installed' }
}
// Handle "none" option - delete current Wikipedia file and update DB
if (optionId === 'none') {
if (currentSelection?.filename) {
try {
await this.delete(currentSelection.filename)
logger.info(`[ZimService] Deleted Wikipedia file: ${currentSelection.filename}`)
} catch (error) {
// File might already be deleted, that's OK
logger.warn(`[ZimService] Could not delete Wikipedia file (may already be gone): ${currentSelection.filename}`)
}
}
// Update or create the selection record (always use first record)
if (currentSelection) {
currentSelection.option_id = 'none'
currentSelection.url = null
currentSelection.filename = null
currentSelection.status = 'none'
await currentSelection.save()
} else {
await WikipediaSelection.create({
option_id: 'none',
url: null,
filename: null,
status: 'none',
})
}
// Restart Kiwix to reflect the change
await this.dockerService
.affectContainer(SERVICE_NAMES.KIWIX, 'restart')
.catch((error) => {
logger.error(`[ZimService] Failed to restart Kiwix after Wikipedia removal:`, error)
})
return { success: true, message: 'Wikipedia removed' }
}
// Start download for the new Wikipedia option
if (!selectedOption.url) {
throw new Error('Selected Wikipedia option has no download URL')
}
// Check if already downloading
const existingJob = await RunDownloadJob.getByUrl(selectedOption.url)
if (existingJob) {
return { success: false, message: 'Download already in progress' }
}
// Extract filename from URL
const filename = selectedOption.url.split('/').pop()
if (!filename) {
throw new Error('Could not determine filename from URL')
}
const filepath = join(process.cwd(), ZIM_STORAGE_PATH, filename)
// Update or create selection record to show downloading status
let selection: WikipediaSelection
if (currentSelection) {
currentSelection.option_id = optionId
currentSelection.url = selectedOption.url
currentSelection.filename = filename
currentSelection.status = 'downloading'
await currentSelection.save()
selection = currentSelection
} else {
selection = await WikipediaSelection.create({
option_id: optionId,
url: selectedOption.url,
filename: filename,
status: 'downloading',
})
}
// Dispatch download job
const result = await RunDownloadJob.dispatch({
url: selectedOption.url,
filepath,
timeout: 30000,
allowedMimeTypes: ZIM_MIME_TYPES,
forceNew: true,
filetype: 'zim',
})
if (!result || !result.job) {
// Revert status on failure to dispatch
selection.option_id = currentSelection?.option_id || 'none'
selection.url = currentSelection?.url || null
selection.filename = currentSelection?.filename || null
selection.status = currentSelection?.status || 'none'
await selection.save()
throw new Error('Failed to dispatch download job')
}
logger.info(`[ZimService] Started Wikipedia download for ${optionId}: ${filename}`)
return {
success: true,
jobId: result.job.id,
message: 'Download started',
}
}
async onWikipediaDownloadComplete(url: string, success: boolean): Promise<void> {
const selection = await this.getWikipediaSelection()
if (!selection || selection.url !== url) {
logger.warn(`[ZimService] Wikipedia download complete callback for unknown URL: ${url}`)
return
}
if (success) {
// Update status to installed
selection.status = 'installed'
await selection.save()
logger.info(`[ZimService] Wikipedia download completed successfully: ${selection.filename}`)
// Delete the old Wikipedia file if it exists and is different
// We need to find what was previously installed
const existingFiles = await this.list()
const wikipediaFiles = existingFiles.files.filter((f) =>
f.name.startsWith('wikipedia_en_') && f.name !== selection.filename
)
for (const oldFile of wikipediaFiles) {
try {
await this.delete(oldFile.name)
logger.info(`[ZimService] Deleted old Wikipedia file: ${oldFile.name}`)
} catch (error) {
logger.warn(`[ZimService] Could not delete old Wikipedia file: ${oldFile.name}`, error)
}
}
} else {
// Download failed - keep the selection record but mark as failed
selection.status = 'failed'
await selection.save()
logger.error(`[ZimService] Wikipedia download failed for: ${selection.filename}`)
}
}
}