Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .eslintignore

This file was deleted.

23 changes: 0 additions & 23 deletions .eslintrc.json

This file was deleted.

2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,5 @@ temp/

# Optional REPL history
.node_repl_history

.gemini
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
## 4.3.1 (2025-07-05)

**Security Fixes**

- **Path Traversal:** Patched a path traversal vulnerability by ensuring all file processing occurs within a temporary directory. This prevents attackers from accessing or manipulating files outside the intended directory.
- **Cross-Site Scripting (XSS):** Implemented HTML sanitization using `dompurify` to prevent potential XSS attacks from malicious PDF files. This ensures that any HTML generated by the package is safe to render in a browser.
- **Dependency Vulnerabilities:** Updated the `brace-expansion` dependency to resolve a low-severity vulnerability.
11 changes: 3 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,6 @@ class PDFProcessingError extends Error {

Full IntelliSense support in VS Code and other TypeScript-aware editors:

![TypeScript IntelliSense](https://via.placeholder.com/600x200?text=IntelliSense+Demo)

- Auto-completion for all methods and options
- Inline documentation on hover
- Type checking at compile time
Expand Down Expand Up @@ -373,10 +371,10 @@ If automatic download fails (e.g., due to network restrictions), you can manuall
cd node_modules/pdf2html/vendor

# Download Apache PDFBox
wget https://archive.apache.org/dist/pdfbox/2.0.33/pdfbox-app-2.0.33.jar
wget https://archive.apache.org/dist/pdfbox/2.0.34/pdfbox-app-2.0.34.jar

# Download Apache Tika
wget https://archive.apache.org/dist/tika/3.1.0/tika-app-3.1.0.jar
wget https://archive.apache.org/dist/tika/3.2.0/tika-app-3.2.0.jar
```

3. Verify the files are in place:
Expand All @@ -390,19 +388,16 @@ If automatic download fails (e.g., due to network restrictions), you can manuall
### Common Issues

1. **"Java is not installed"**

- Install Java JRE 8 or higher
- Ensure `java` is in your system PATH
- Verify with: `java -version`

2. **"File not found" errors**

- Check that the PDF path is correct
- Use absolute paths for better reliability
- Ensure the file has read permissions

3. **"Buffer size exceeded"**

- Increase maxBuffer option
- Process large PDFs page by page
- Consider splitting very large PDFs
Expand Down Expand Up @@ -441,7 +436,7 @@ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENS

## 📊 Dependencies

- **Production**: Apache Tika 3.1.0, Apache PDFBox 2.0.33
- **Production**: Apache Tika 3.2.0, Apache PDFBox 2.0.34
- **Development**: See package.json for development dependencies

---
Expand Down
4 changes: 2 additions & 2 deletions constants.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
const path = require('path');

module.exports = {
VENDOR_PDF_BOX_JAR: 'pdfbox-app-2.0.33.jar',
VENDOR_TIKA_JAR: 'tika-app-3.1.0.jar',
VENDOR_PDF_BOX_JAR: 'pdfbox-app-2.0.34.jar',
VENDOR_TIKA_JAR: 'tika-app-3.2.0.jar',

DIRECTORY: {
PDF: path.join(__dirname, './files/pdf/'),
Expand Down
30 changes: 30 additions & 0 deletions eslint.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
const globals = require('globals');
const js = require('@eslint/js');

const prettierPlugin = require('eslint-plugin-prettier');
const prettierConfig = require('eslint-config-prettier');

// Global identifiers the linter should treat as defined. The browser, Node.js
// and Mocha sets are merged in that order, so a later set wins on any clash.
const knownGlobals = { ...globals.browser, ...globals.node, ...globals.mocha };

// Project-specific settings, applied to every .js file in the repository.
const projectConfig = {
  files: ['**/*.js'],
  languageOptions: {
    ecmaVersion: 'latest',
    sourceType: 'commonjs',
    globals: knownGlobals,
  },
  plugins: { prettier: prettierPlugin },
  rules: {
    // The rule supplied by the prettier plugin is raised to error severity.
    'prettier/prettier': 'error',
    // The no-console rule is switched off.
    'no-console': 'off',
  },
};

/**
 * ESLint flat configuration.
 *
 * Entries are listed in order: the recommended rule set from @eslint/js, the
 * shared config exported by eslint-config-prettier, then the project settings.
 */
module.exports = [js.configs.recommended, prettierConfig, projectConfig];
7 changes: 6 additions & 1 deletion lib/HTMLParser.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
// lib/HTMLParser.js
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const DOMPurify = require('dompurify');

const window = new JSDOM('').window;
const purify = DOMPurify(window);

/**
* HTML content parser
Expand All @@ -11,7 +16,7 @@ class HTMLParser {

$('.page').each((index, element) => {
const $page = $(element);
const content = options.text ? $page.text().trim() : $page.html();
const content = options.text ? $page.text().trim() : purify.sanitize($page.html());
pages.push(content);
});

Expand Down
22 changes: 1 addition & 21 deletions lib/PDFBoxWrapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const debug = require('debug')('pdf2html');
const path = require('path');
const fse = require('fs-extra');
const defaults = require('lodash.defaults');
const URI = require('urijs');

const CommandExecutor = require('./CommandExecutor');
const ImageProcessor = require('./ImageProcessor');
const FileManager = require('./FileManager');
Expand All @@ -16,27 +16,7 @@ const { DEFAULT_OPTIONS } = require('./config');
class PDFBoxWrapper {
static async generateImage(filepath, options) {
const opts = defaults(options, DEFAULT_OPTIONS.thumbnail);
const uri = new URI(filepath);

// Check if the filepath is already in the temp directory
const isInTempDir = filepath.includes(constants.DIRECTORY.PDF);

if (isInTempDir) {
// File is already in the temp directory, process it directly
// Generate image using PDFBox
await this.executePDFBox(filepath, opts);

// Determine file paths
const pdfBoxImagePath = this.getPDFBoxImagePath(filepath, opts);
const finalImagePath = path.join(constants.DIRECTORY.IMAGE, uri.filename().replace(uri.suffix(), opts.imageType));

// Process the generated image
await this.processGeneratedImage(pdfBoxImagePath, finalImagePath, opts);

return finalImagePath;
}

// Use the original withTempFile logic for non-temp files
return FileManager.withTempFile(filepath, constants.DIRECTORY.PDF, async (tempFilePath, tempUri) => {
// Generate image using PDFBox
await this.executePDFBox(tempFilePath, opts);
Expand Down
9 changes: 6 additions & 3 deletions lib/TikaWrapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@ const path = require('path');
const CommandExecutor = require('./CommandExecutor');
const { DEFAULT_OPTIONS } = require('./config');
const constants = require('../constants');
const FileManager = require('./FileManager');

/**
* Apache Tika wrapper for content extraction
*/
class TikaWrapper {
static async extract(filepath, format, options = {}) {
const args = ['-jar', path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_TIKA_JAR), `--${format}`, filepath];
return FileManager.withTempFile(filepath, constants.DIRECTORY.PDF, async (tempFilePath) => {
const args = ['-jar', path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_TIKA_JAR), `--${format}`, tempFilePath];

const maxBuffer = options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer;
return CommandExecutor.execute('java', args, { maxBuffer });
const maxBuffer = options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer;
return CommandExecutor.execute('java', args, { maxBuffer });
});
}

static async extractHTML(filepath, options) {
Expand Down
Loading