mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
48 lines
1.0 KiB
TypeScript
48 lines
1.0 KiB
TypeScript
|
|
export const HTML_SELECTORS_TO_REMOVE = [
|
|
'script',
|
|
'style',
|
|
'nav',
|
|
'header',
|
|
'footer',
|
|
'noscript',
|
|
'iframe',
|
|
'svg',
|
|
'.navbox',
|
|
'.sidebar',
|
|
'.infobox',
|
|
'.mw-editsection',
|
|
'.reference',
|
|
'.reflist',
|
|
'.toc',
|
|
'.noprint',
|
|
'.mw-jump-link',
|
|
'.mw-headline-anchor',
|
|
'[role="navigation"]',
|
|
'.navbar',
|
|
'.hatnote',
|
|
'.ambox',
|
|
'.sistersitebox',
|
|
'.portal',
|
|
'#coordinates',
|
|
'.geo-nondefault',
|
|
'.authority-control',
|
|
]
|
|
|
|
// Common heading names that usually don't have meaningful content under them
|
|
export const NON_CONTENT_HEADING_PATTERNS = [
|
|
/^see also$/i,
|
|
/^references$/i,
|
|
/^external links$/i,
|
|
/^further reading$/i,
|
|
/^notes$/i,
|
|
/^bibliography$/i,
|
|
/^navigation$/i,
|
|
]
|
|
|
|
/**
|
|
* Batch size for processing ZIM articles to prevent lock timeout errors.
|
|
* Processing 50 articles at a time balances throughput with job duration.
|
|
* Typical processing time: 2-5 minutes per batch depending on article complexity.
|
|
*/
|
|
export const ZIM_BATCH_SIZE = 50 |