From 56c214be06a0b8332505e782b42072afc152596a Mon Sep 17 00:00:00 2001
From: Robert Gruen
Date: Tue, 6 Jan 2026 11:04:48 -0800
Subject: [PATCH 1/6] Added ai generated readme

---
 python/fineTuning/unsloth/README.md | 106 ++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 python/fineTuning/unsloth/README.md

diff --git a/python/fineTuning/unsloth/README.md b/python/fineTuning/unsloth/README.md
new file mode 100644
index 000000000..690247121
--- /dev/null
+++ b/python/fineTuning/unsloth/README.md
@@ -0,0 +1,106 @@
+# Unsloth Fine-Tuning Tools
+
+This folder contains tools for fine-tuning language models using the [Unsloth](https://github.com/unslothai/unsloth) library, specifically focused on knowledge extraction tasks.
+
+## Overview
+
+These scripts enable training and inference of language models to extract structured knowledge (entities, actions, relationships) from conversational text, particularly NPR podcast transcripts.
+
+## Files
+
+### Training & Inference
+
+| File | Description |
+|------|-------------|
+| [trainEntities.py](trainEntities.py) | Fine-tunes a language model (e.g., Phi-4) using LoRA adapters to extract knowledge from text. Supports 4-bit quantization for efficient training. |
+| [batchInfer.py](batchInfer.py) | Runs batch inference on a dataset using a trained model to extract knowledge structures. |
+| [knowledgePrompt.py](knowledgePrompt.py) | Contains the prompt template that defines the knowledge extraction schema (entities, actions, facets). |
+
+### Keyword Extraction
+
+| File | Description |
+|------|-------------|
+| [nltkExtract.py](nltkExtract.py) | Extracts keywords from datasets using multiple methods: NLTK-RAKE, YAKE, KeyBERT, and spaCy. |
+| [baseExtract.py](baseExtract.py) | Basic word extraction and analysis from dataset messages. |
+
+## Requirements
+
+- Python 3.10+
+- CUDA-capable GPU (recommended)
+- Dependencies (install via pip):
+  ```
+  unsloth
+  torch
+  transformers
+  datasets
+  rake-nltk
+  yake
+  keybert
+  spacy
+  sentence-transformers
+  ```
+
+For spaCy, download the English model:
+```bash
+python -m spacy download en_core_web_sm
+```
+
+## Usage
+
+### Training a Model
+
+```bash
+python trainEntities.py
+```
+
+This script:
+1. Loads a pre-trained model (default: `unsloth/Phi-4`)
+2. Applies LoRA adapters for efficient fine-tuning
+3. Trains on knowledge extraction data
+4. Saves the fine-tuned model to `/data/phi-4-lora-3200`
+
+### Batch Inference
+
+```bash
+python batchInfer.py --model_path /data/phi-4-lora-3200 --dataset_path /data/dataset.json --output_file results.txt
+```
+
+Arguments:
+- `--model_path`: Path to the fine-tuned model
+- `--dataset_path`: Path to input JSON dataset
+- `--output_file`: Path for output results
+
+### Keyword Extraction
+
+```bash
+python nltkExtract.py --dataset_path /data/dataset.json --output_file extraction.txt --max_length 1 --verbose
+```
+
+Arguments:
+- `--dataset_path`: Path to input JSON dataset
+- `--max_length`: Maximum words in keyword phrases
+- `--output_file`: Path for output results
+- `--verbose`: Enable detailed output
+- `--nogpu`: Force CPU usage instead of GPU
+
+## Knowledge Schema
+
+The knowledge extraction prompt defines the following TypeScript types:
+
+- **ConcreteEntity**: Named entities with types and facets
+- **Action**: Verbs with subjects, objects, and tense
+- **Facet**: Properties/attributes of entities
+- **KnowledgeResponse**: Combined entities and actions
+
+## Model Support
+
+The training script supports various 4-bit quantized models including:
+- Llama 3.1 (8B, 70B, 405B)
+- Mistral (7B, Nemo 12B)
+- Phi-3.5, Phi-4
+- Gemma 2 (9B, 27B)
+
+## License
+
+Copyright (c) Microsoft Corporation and Henry Lucco.
+Licensed under the MIT License.
From a591fa9d45a94b59a067cd6612dac0f9a773fdc4 Mon Sep 17 00:00:00 2001
From: Robert Gruen
Date: Tue, 6 Jan 2026 11:09:39 -0800
Subject: [PATCH 2/6] added whitespace

---
 ts/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ts/README.md b/ts/README.md
index eab68e535..af57ae343 100644
--- a/ts/README.md
+++ b/ts/README.md
@@ -19,7 +19,7 @@ To build:
 - Install pnpm (`npm i -g pnpm && pnpm setup`)
 - **(Linux/WSL Only)** Read TypeAgent Shell's [README.md](./packages/shell/README.md) for additional requirements
 
-### Steps
+### Steps 
 
 In this directory:
 
From 536260408d821209ecc518ae662958e9524d2fe8 Mon Sep 17 00:00:00 2001
From: Robert Gruen
Date: Tue, 6 Jan 2026 11:29:33 -0800
Subject: [PATCH 3/6] linting now done as a separate job

---
Review note: the temporary "dev/robgruen/workflow_update" push trigger was
dropped from this patch so that only "main" triggers push builds; use
workflow_dispatch to exercise the workflow from a topic branch. Blob ids on
the index lines are those of the original series.

 .github/workflows/build-ts.yml | 50 ++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build-ts.yml b/.github/workflows/build-ts.yml
index c3ceb2c50..bde196815 100644
--- a/.github/workflows/build-ts.yml
+++ b/.github/workflows/build-ts.yml
@@ -27,7 +27,50 @@ env:
   NODE_OPTIONS: --max_old_space_size=8192
 
 jobs:
-  build_ts:
+  linting:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-latest", "windows-latest", "macos-latest"]
+        version: [20, 22]
+
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Setup Git LF
+        run: |
+          git config --global core.autocrlf false
+      - uses: actions/checkout@v4
+      - uses: dorny/paths-filter@v3
+        id: filter
+        continue-on-error: true
+        with:
+          filters: |
+            ts:
+              - "ts/**"
+              - ".github/workflows/build-ts.yml"
+      - uses: pnpm/action-setup@v4
+        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        name: Install pnpm
+        with:
+          package_json_file: ts/package.json
+      - uses: actions/setup-node@v4
+        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        with:
+          node-version: ${{ matrix.version }}
+          cache: "pnpm"
+          cache-dependency-path: ts/pnpm-lock.yaml
+      - name: Install dependencies
+        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        working-directory: ts
+        run: |
+          pnpm install --frozen-lockfile --strict-peer-dependencies
+      - name: Lint
+        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        working-directory: ts
+        run: |
+          npm run lint
+
+  build_and_test:
     strategy:
       fail-fast: false
       matrix:
@@ -72,11 +115,6 @@ jobs:
         working-directory: ts
         run: |
           npm run build
-      - name: Lint
-        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
-        working-directory: ts
-        run: |
-          npm run lint
       - name: Test
         if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
         working-directory: ts
From 1f11fd465a8c545c48ce2975b1048b4307800873 Mon Sep 17 00:00:00 2001
From: Robert Gruen
Date: Tue, 6 Jan 2026 12:03:48 -0800
Subject: [PATCH 4/6] linting only on latest node version

---
 .github/workflows/build-ts.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-ts.yml b/.github/workflows/build-ts.yml
index bde196815..e1e8415fb 100644
--- a/.github/workflows/build-ts.yml
+++ b/.github/workflows/build-ts.yml
@@ -32,7 +32,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest", "windows-latest", "macos-latest"]
-        version: [20, 22]
+        version: [22]
 
     runs-on: ${{ matrix.os }}
     steps:
From 05f488950a395c7ea38e6620b1dd5942220681c3 Mon Sep 17 00:00:00 2001
From: Robert Gruen
Date: Tue, 6 Jan 2026 12:20:25 -0800
Subject: [PATCH 5/6] package now depends on build and reuses the artifacts from build for a quicker completion.

---
Review notes:
- download-artifact@v4 only searches the current run unless run-id and
  github-token are given, so the triggering build-ts run is now named
  explicitly. The token needs "actions: read"; confirm the workflow's
  permissions block grants it.
- upload-artifact roots the archive at ts/packages (the common ancestor of the
  two path globs), so the download now extracts into ts/packages, not ts.
- workflow_dispatch runs have no upstream artifacts, so the local Build step is
  kept for them instead of being deleted.
- The artifact name assumes this workflow's matrix (os, version) is a subset of
  the build-ts build matrix; confirm.
- Blob ids on the index lines are those of the original series.

 .github/workflows/build-package-shell.yml | 41 ++++++++++++++-----------
 .github/workflows/build-ts.yml            |  9 ++++++
 2 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/build-package-shell.yml b/.github/workflows/build-package-shell.yml
index fb481a2bf..c589e598f 100644
--- a/.github/workflows/build-package-shell.yml
+++ b/.github/workflows/build-package-shell.yml
@@ -7,11 +7,10 @@ name: build-package-shell
 
 on:
   workflow_dispatch:
-  push:
-    branches: ["main"]
-  pull_request:
-    branches: ["main"]
-  merge_group:
+  workflow_run:
+    workflows: ["build-ts"]
+    types:
+      - completed
     branches: ["main"]
 
 concurrency:
@@ -41,37 +40,43 @@ jobs:
         run: |
           git config --global core.autocrlf false
       - uses: actions/checkout@v4
-      - uses: dorny/paths-filter@v3
-        id: filter
-        continue-on-error: true
-        with:
-          filters: |
-            ts:
-              - "ts/**"
-              - ".github/workflows/build-package-shell.yml"
       - uses: pnpm/action-setup@v4
-        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') }}
         name: Install pnpm
         with:
           package_json_file: ts/package.json
       - uses: actions/setup-node@v4
-        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') }}
         with:
           node-version: ${{ matrix.version }}
           cache: "pnpm"
           cache-dependency-path: ts/pnpm-lock.yaml
       - name: Install dependencies
-        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') }}
         working-directory: ts
         run: |
           pnpm install --frozen-lockfile --strict-peer-dependencies
+      # Manual dispatches have no upstream build-ts run to take artifacts from,
+      # so they still build the shell locally.
       - name: Build
-        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        if: ${{ github.event_name == 'workflow_dispatch' }}
         working-directory: ts
         run: |
           pnpm run build:shell
         env:
           DEBUG_DEMB: true
+      # The artifact lives in the build-ts run that triggered this workflow, so the
+      # download must name that run and pass a token (needs `actions: read`).
+      # upload-artifact roots the archive at ts/packages (the common ancestor of
+      # its path globs), so extract there rather than into ts.
+      - name: Download build artifacts
+        if: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' }}
+        uses: actions/download-artifact@v4
+        with:
+          name: ts-build-${{ matrix.os }}-node${{ matrix.version }}
+          path: ts/packages
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ github.token }}
       # Reusing the cache can be flaky. If a downloaded archive is corrupted it can cause
       # problems because github tries to reuse the same machine for the next job leading to
       # blocked builds. This is a bug in electron-builder, it's not smart enough to retry acquiring the archive.
@@ -87,7 +92,7 @@ jobs:
       #     restore-keys: |
       #       electron | ${{ runner.os }} | ${{ runner.arch }}
       - name: Package - shell
-        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') }}
         working-directory: ts
         shell: bash
         run: pnpm run shell:package
diff --git a/.github/workflows/build-ts.yml b/.github/workflows/build-ts.yml
index e1e8415fb..a40147ab0 100644
--- a/.github/workflows/build-ts.yml
+++ b/.github/workflows/build-ts.yml
@@ -115,6 +115,15 @@ jobs:
         working-directory: ts
         run: |
           npm run build
+      - uses: actions/upload-artifact@v4
+        if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
+        name: Upload build artifacts
+        with:
+          name: ts-build-${{ matrix.os }}-node${{ matrix.version }}
+          path: |
+            ts/packages/**/dist/**
+            ts/packages/**/lib/**
+          retention-days: 7
       - name: Test
         if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
         working-directory: ts
From 8ee76239d6345734db583031caf717e3fe50099c Mon Sep 17 00:00:00 2001
From: Robert Gruen
Date: Tue, 6 Jan 2026 12:34:41 -0800
Subject: [PATCH 6/6] tuning artifact parameters

---
 .github/workflows/build-ts.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-ts.yml b/.github/workflows/build-ts.yml
index a40147ab0..aead196b3 100644
--- a/.github/workflows/build-ts.yml
+++ b/.github/workflows/build-ts.yml
@@ -120,10 +120,11 @@ jobs:
         name: Upload build artifacts
         with:
           name: ts-build-${{ matrix.os }}-node${{ matrix.version }}
+          retention-days: 1
+          compression-level: 0
           path: |
             ts/packages/**/dist/**
             ts/packages/**/lib/**
-          retention-days: 7
       - name: Test
         if: ${{ github.event_name != 'pull_request' || steps.filter.outputs.ts != 'false' }}
         working-directory: ts