shebinleo · Jul 5, 2025
diff --git a/.eslintignore b/.eslintignore
diff --git a/.eslintrc.json b/.eslintrc.json
diff --git a/.gitignore b/.gitignore
@@ -57,3 +57,5 @@ temp/
 
 # Optional REPL history
 .node_repl_history
+
+.gemini
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,7 @@
+## 4.3.1 (2025-07-05)
+
+**Security Fixes**
+
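+- **Path Traversal:** Patched a path traversal vulnerability: input files are now always routed through `FileManager.withTempFile`, so Tika and PDFBox only operate on files inside the package's temporary directory. The previous substring check that let a caller-supplied path skip this step was removed. This prevents attackers from accessing or manipulating files outside of the intended directory.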
+- **Cross-Site Scripting (XSS):** Implemented HTML sanitization using `dompurify` to prevent potential XSS attacks from malicious PDF files. Per-page HTML returned by the HTML parser is now sanitized before it is handed back to the caller, so that it is safe to render in a browser.
+- **Dependency Vulnerabilities:** Updated the `brace-expansion` dependency to resolve a low-severity vulnerability.
diff --git a/README.md b/README.md
@@ -224,8 +224,6 @@ class PDFProcessingError extends Error {
 
 Full IntelliSense support in VS Code and other TypeScript-aware editors:
 
-![TypeScript IntelliSense](https://via.placeholder.com/600x200?text=IntelliSense+Demo)
-
 - Auto-completion for all methods and options
 - Inline documentation on hover
 - Type checking at compile time
@@ -373,10 +371,10 @@ If automatic download fails (e.g., due to network restrictions), you can manuall
     cd node_modules/pdf2html/vendor
 
     # Download Apache PDFBox
-    wget https://archive.apache.org/dist/pdfbox/2.0.33/pdfbox-app-2.0.33.jar
+    wget https://archive.apache.org/dist/pdfbox/2.0.34/pdfbox-app-2.0.34.jar
 
     # Download Apache Tika
-    wget https://archive.apache.org/dist/tika/3.1.0/tika-app-3.1.0.jar
+    wget https://archive.apache.org/dist/tika/3.2.0/tika-app-3.2.0.jar
     ```
 
 3. Verify the files are in place:
@@ -390,19 +388,16 @@ If automatic download fails (e.g., due to network restrictions), you can manuall
 ### Common Issues
 
 1. **"Java is not installed"**
-
     - Install Java JRE 8 or higher
     - Ensure `java` is in your system PATH
     - Verify with: `java -version`
 
 2. **"File not found" errors**
-
     - Check that the PDF path is correct
     - Use absolute paths for better reliability
     - Ensure the file has read permissions
 
 3. **"Buffer size exceeded"**
-
     - Increase maxBuffer option
     - Process large PDFs page by page
     - Consider splitting very large PDFs
@@ -441,7 +436,7 @@ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENS
 
 ## 📊 Dependencies
 
-- **Production**: Apache Tika 3.1.0, Apache PDFBox 2.0.33
+- **Production**: Apache Tika 3.2.0, Apache PDFBox 2.0.34
 - **Development**: See package.json for development dependencies
 
 ---

diff --git a/constants.js b/constants.js
@@ -1,8 +1,8 @@
 const path = require('path');
 
 module.exports = {
-    VENDOR_PDF_BOX_JAR: 'pdfbox-app-2.0.33.jar',
-    VENDOR_TIKA_JAR: 'tika-app-3.1.0.jar',
+    VENDOR_PDF_BOX_JAR: 'pdfbox-app-2.0.34.jar',
+    VENDOR_TIKA_JAR: 'tika-app-3.2.0.jar',
 
     DIRECTORY: {
         PDF: path.join(__dirname, './files/pdf/'),

diff --git a/eslint.config.js b/eslint.config.js
@@ -0,0 +1,30 @@
+const globals = require('globals');
+const js = require('@eslint/js');
+
+const prettierPlugin = require('eslint-plugin-prettier');
+const prettierConfig = require('eslint-config-prettier');
+
+module.exports = [
+    js.configs.recommended,
+
+    prettierConfig,
+    {
+        files: ['**/*.js'],
+        languageOptions: {
+            ecmaVersion: 'latest',
+            sourceType: 'commonjs',
+            globals: {
+                ...globals.browser,
+                ...globals.node,
+                ...globals.mocha,
+            },
+        },
+        plugins: {
+            prettier: prettierPlugin,
+        },
+        rules: {
+            'prettier/prettier': 'error',
+            'no-console': 'off',
+        },
+    },
+];
diff --git a/lib/HTMLParser.js b/lib/HTMLParser.js
@@ -1,5 +1,10 @@
 // lib/HTMLParser.js
 const cheerio = require('cheerio');
+const { JSDOM } = require('jsdom');
+const DOMPurify = require('dompurify');
+
+const window = new JSDOM('').window;
+const purify = DOMPurify(window);
 
 /**
  * HTML content parser
@@ -11,7 +16,7 @@ class HTMLParser {
 
         $('.page').each((index, element) => {
             const $page = $(element);
-            const content = options.text ? $page.text().trim() : $page.html();
+            const content = options.text ? $page.text().trim() : purify.sanitize($page.html());
             pages.push(content);
         });
 

diff --git a/lib/PDFBoxWrapper.js b/lib/PDFBoxWrapper.js
@@ -3,7 +3,7 @@ const debug = require('debug')('pdf2html');
 const path = require('path');
 const fse = require('fs-extra');
 const defaults = require('lodash.defaults');
-const URI = require('urijs');
+
 const CommandExecutor = require('./CommandExecutor');
 const ImageProcessor = require('./ImageProcessor');
 const FileManager = require('./FileManager');
@@ -16,27 +16,7 @@ const { DEFAULT_OPTIONS } = require('./config');
 class PDFBoxWrapper {
     static async generateImage(filepath, options) {
         const opts = defaults(options, DEFAULT_OPTIONS.thumbnail);
-        const uri = new URI(filepath);
-
-        // Check if the filepath is already in the temp directory
-        const isInTempDir = filepath.includes(constants.DIRECTORY.PDF);
-
-        if (isInTempDir) {
-            // File is already in the temp directory, process it directly
-            // Generate image using PDFBox
-            await this.executePDFBox(filepath, opts);
-
-            // Determine file paths
-            const pdfBoxImagePath = this.getPDFBoxImagePath(filepath, opts);
-            const finalImagePath = path.join(constants.DIRECTORY.IMAGE, uri.filename().replace(uri.suffix(), opts.imageType));
-
-            // Process the generated image
-            await this.processGeneratedImage(pdfBoxImagePath, finalImagePath, opts);
-
-            return finalImagePath;
-        }
 
-        // Use the original withTempFile logic for non-temp files
         return FileManager.withTempFile(filepath, constants.DIRECTORY.PDF, async (tempFilePath, tempUri) => {
             // Generate image using PDFBox
             await this.executePDFBox(tempFilePath, opts);

diff --git a/lib/TikaWrapper.js b/lib/TikaWrapper.js
@@ -4,16 +4,19 @@ const path = require('path');
 const CommandExecutor = require('./CommandExecutor');
 const { DEFAULT_OPTIONS } = require('./config');
 const constants = require('../constants');
+const FileManager = require('./FileManager');
 
 /**
  * Apache Tika wrapper for content extraction
  */
 class TikaWrapper {
     static async extract(filepath, format, options = {}) {
-        const args = ['-jar', path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_TIKA_JAR), `--${format}`, filepath];
+        return FileManager.withTempFile(filepath, constants.DIRECTORY.PDF, async (tempFilePath) => {
+            const args = ['-jar', path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_TIKA_JAR), `--${format}`, tempFilePath];
 
-        const maxBuffer = options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer;
-        return CommandExecutor.execute('java', args, { maxBuffer });
+            const maxBuffer = options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer;
+            return CommandExecutor.execute('java', args, { maxBuffer });
+        });
     }
 
     static async extractHTML(filepath, options) {
Original file line number	Diff line number	Diff line change
Expand Up		@@ -57,3 +57,5 @@ temp/

		# Optional REPL history
		.node_repl_history

		.gemini