From 4afb9267410e576ede69f48cfe0b49e99b306560 Mon Sep 17 00:00:00 2001 From: Chris Sherwood Date: Wed, 18 Mar 2026 16:52:34 -0700 Subject: [PATCH] fix: improve download reliability with stall detection, failure visibility, and Wikipedia status tracking Three bugs caused downloads to hang, disappear, or leave stuck spinners: 1. Wikipedia downloads that failed never updated the DB status from 'downloading', leaving the spinner stuck forever. Now the worker's failed handler marks them as failed. 2. No stall detection on streaming downloads - if data stopped flowing mid-download, the job hung indefinitely. Added a 5-minute stall timer that triggers retry. 3. Failed jobs were invisible to users since only waiting/active/delayed states were queried. Now failed jobs appear with error indicators in the download list. Closes #364, closes #216 Co-Authored-By: Claude Opus 4.6 (1M context) --- admin/app/services/download_service.ts | 16 +++++-- admin/app/utils/downloads.ts | 24 +++++++++- admin/commands/queue/work.ts | 17 ++++++- admin/inertia/components/ActiveDownloads.tsx | 46 ++++++++++++++----- .../inertia/components/WikipediaSelector.tsx | 27 +++++++++-- admin/types/downloads.ts | 2 + 6 files changed, 111 insertions(+), 21 deletions(-) diff --git a/admin/app/services/download_service.ts b/admin/app/services/download_service.ts index b5db238..63c7ecd 100644 --- a/admin/app/services/download_service.ts +++ b/admin/app/services/download_service.ts @@ -12,7 +12,7 @@ export class DownloadService { async listDownloadJobs(filetype?: string): Promise { // Get regular file download jobs (zim, map, etc.) const queue = this.queueService.getQueue(RunDownloadJob.queue) - const fileJobs = await queue.getJobs(['waiting', 'active', 'delayed']) + const fileJobs = await queue.getJobs(['waiting', 'active', 'delayed', 'failed']) const fileDownloads = fileJobs.map((job) => ({ jobId: job.id!.toString(), @@ -20,11 +20,13 @@ export class DownloadService { progress: parseInt(job.progress.toString(), 10), filepath: normalize(job.data.filepath), filetype: job.data.filetype, + status: (job.failedReason ? 'failed' : 'active') as 'active' | 'failed', + failedReason: job.failedReason || undefined, })) // Get Ollama model download jobs const modelQueue = this.queueService.getQueue(DownloadModelJob.queue) - const modelJobs = await modelQueue.getJobs(['waiting', 'active', 'delayed']) + const modelJobs = await modelQueue.getJobs(['waiting', 'active', 'delayed', 'failed']) const modelDownloads = modelJobs.map((job) => ({ jobId: job.id!.toString(), @@ -32,6 +34,8 @@ export class DownloadService { progress: parseInt(job.progress.toString(), 10), filepath: job.data.modelName || 'Unknown Model', // Use model name as filepath filetype: 'model', + status: (job.failedReason ? 'failed' : 'active') as 'active' | 'failed', + failedReason: job.failedReason || undefined, })) const allDownloads = [...fileDownloads, ...modelDownloads] @@ -39,7 +43,11 @@ export class DownloadService { // Filter by filetype if specified const filtered = allDownloads.filter((job) => !filetype || job.filetype === filetype) - // Sort so actively downloading items (progress > 0) appear first, then by progress descending - return filtered.sort((a, b) => b.progress - a.progress) + // Sort: active downloads first (by progress desc), then failed at the bottom + return filtered.sort((a, b) => { + if (a.status === 'failed' && b.status !== 'failed') return 1 + if (a.status !== 'failed' && b.status === 'failed') return -1 + return b.progress - a.progress + }) } } diff --git a/admin/app/utils/downloads.ts b/admin/app/utils/downloads.ts index 7c36378..1c26a74 100644 --- a/admin/app/utils/downloads.ts +++ b/admin/app/utils/downloads.ts @@ -88,10 +88,29 @@ export async function doResumableDownload({ let lastProgressTime = Date.now() let lastDownloadedBytes = startByte + // Stall detection: if no data arrives for 5 minutes, abort the download + const STALL_TIMEOUT_MS = 5 * 60 * 1000 + let stallTimer: ReturnType | null = null + + const clearStallTimer = () => { + if (stallTimer) { + clearTimeout(stallTimer) + stallTimer = null + } + } + + const resetStallTimer = () => { + clearStallTimer() + stallTimer = setTimeout(() => { + cleanup(new Error('Download stalled - no data received for 5 minutes')) + }, STALL_TIMEOUT_MS) + } + // Progress tracking stream to monitor data flow const progressStream = new Transform({ transform(chunk: Buffer, _: any, callback: Function) { downloadedBytes += chunk.length + resetStallTimer() // Update progress tracking const now = Date.now() @@ -118,6 +137,7 @@ export async function doResumableDownload({ // Handle errors and cleanup const cleanup = (error?: Error) => { + clearStallTimer() progressStream.destroy() response.data.destroy() writeStream.destroy() @@ -136,6 +156,7 @@ export async function doResumableDownload({ }) writeStream.on('finish', async () => { + clearStallTimer() if (onProgress) { onProgress({ downloadedBytes, @@ -151,7 +172,8 @@ export async function doResumableDownload({ resolve(filepath) }) - // Pipe: response -> progressStream -> writeStream + // Start stall timer and pipe: response -> progressStream -> writeStream + resetStallTimer() response.data.pipe(progressStream).pipe(writeStream) }) } diff --git a/admin/commands/queue/work.ts b/admin/commands/queue/work.ts index e39fdbf..453268d 100644 --- a/admin/commands/queue/work.ts +++ b/admin/commands/queue/work.ts @@ -65,8 +65,23 @@ export default class QueueWork extends BaseCommand { } ) - worker.on('failed', (job, err) => { + worker.on('failed', async (job, err) => { this.logger.error(`[${queueName}] Job failed: ${job?.id}, Error: ${err.message}`) + + // If this was a Wikipedia download, mark it as failed in the DB + if (job?.data?.filetype === 'zim' && job?.data?.url?.includes('wikipedia_en_')) { + try { + const { DockerService } = await import('#services/docker_service') + const { ZimService } = await import('#services/zim_service') + const dockerService = new DockerService() + const zimService = new ZimService(dockerService) + await zimService.onWikipediaDownloadComplete(job.data.url, false) + } catch (e: any) { + this.logger.error( + `[${queueName}] Failed to update Wikipedia status: ${e.message}` + ) + } + } }) worker.on('completed', (job) => { diff --git a/admin/inertia/components/ActiveDownloads.tsx b/admin/inertia/components/ActiveDownloads.tsx index 1319aaa..69bbb8f 100644 --- a/admin/inertia/components/ActiveDownloads.tsx +++ b/admin/inertia/components/ActiveDownloads.tsx @@ -2,6 +2,7 @@ import useDownloads, { useDownloadsProps } from '~/hooks/useDownloads' import HorizontalBarChart from './HorizontalBarChart' import { extractFileName } from '~/lib/util' import StyledSectionHeader from './StyledSectionHeader' +import { IconAlertTriangle } from '@tabler/icons-react' interface ActiveDownloadProps { filetype?: useDownloadsProps['filetype'] @@ -17,18 +18,39 @@ const ActiveDownloads = ({ filetype, withHeader = false }: ActiveDownloadProps)
{downloads && downloads.length > 0 ? ( downloads.map((download) => ( -
- +
+ {download.status === 'failed' ? ( +
+ +
+

+ {extractFileName(download.filepath) || download.url} +

+

+ Download failed{download.failedReason ? `: ${download.failedReason}` : ''} +

+
+
+ ) : ( + + )}
)) ) : ( diff --git a/admin/inertia/components/WikipediaSelector.tsx b/admin/inertia/components/WikipediaSelector.tsx index 8e29dd6..7587aa2 100644 --- a/admin/inertia/components/WikipediaSelector.tsx +++ b/admin/inertia/components/WikipediaSelector.tsx @@ -1,7 +1,7 @@ import { formatBytes } from '~/lib/util' import { WikipediaOption, WikipediaCurrentSelection } from '../../types/downloads' import classNames from 'classnames' -import { IconCheck, IconDownload, IconWorld } from '@tabler/icons-react' +import { IconCheck, IconDownload, IconWorld, IconAlertTriangle } from '@tabler/icons-react' import StyledButton from './StyledButton' import LoadingSpinner from './LoadingSpinner' @@ -29,8 +29,9 @@ const WikipediaSelector: React.FC = ({ // Determine which option to highlight const highlightedOptionId = selectedOptionId ?? currentSelection?.optionId ?? null - // Check if current selection is downloading + // Check if current selection is downloading or failed const isDownloading = currentSelection?.status === 'downloading' + const isFailed = currentSelection?.status === 'failed' return (
@@ -55,6 +56,18 @@ const WikipediaSelector: React.FC = ({
)} + {/* Failed status message */} + {isFailed && ( +
+
+ + + Wikipedia download failed. Select a package and try again. + +
+
+ )} + {/* Options grid */}
{options.map((option) => { @@ -63,6 +76,8 @@ const WikipediaSelector: React.FC = ({ currentSelection?.optionId === option.id && currentSelection?.status === 'installed' const isCurrentDownloading = currentSelection?.optionId === option.id && currentSelection?.status === 'downloading' + const isCurrentFailed = + currentSelection?.optionId === option.id && currentSelection?.status === 'failed' const isPending = selectedOptionId === option.id && selectedOptionId !== currentSelection?.optionId return ( @@ -100,6 +115,12 @@ const WikipediaSelector: React.FC = ({ Downloading )} + {isCurrentFailed && ( + + + Failed + + )}
{/* Option content */} @@ -136,7 +157,7 @@ const WikipediaSelector: React.FC = ({
{/* Submit button for Content Explorer mode */} - {showSubmitButton && selectedOptionId && selectedOptionId !== currentSelection?.optionId && ( + {showSubmitButton && selectedOptionId && (selectedOptionId !== currentSelection?.optionId || isFailed) && (