Skip to content

Commit

Permalink
can run in browser
Browse files Browse the repository at this point in the history
  • Loading branch information
aeltorio committed Oct 5, 2024
1 parent 92ccbd0 commit 865cb6b
Show file tree
Hide file tree
Showing 9 changed files with 107 additions and 14 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ yarn build

## Use

To use this tool
To use this tool in nodejs, you can use the following code:

```js

Expand All @@ -44,6 +44,29 @@ main()

```

In the browser, you can use the following code:

```js
import { SentencePieceProcessor, cleanText, llama_3_1_tokeniser_b64 } from "@sctg/sentencepiece-js";
// built-in models: llama_3_1_tokeniser_b64, clean_30k_b64, smart_b64
async function main() {

let text = "I am still waiting on my card?"
let cleaned = cleanText(text)

let spp = new SentencePieceProcessor()
await spp.loadFromB64StringModel(llama_3_1_tokeniser_b64);
let ids = spp.encodeIds(cleaned)
console.log(ids)
let str = spp.decodeIds(ids) // list of number ids -> string
console.log(str)

let pieces = spp.encodePieces(cleaned) // list tokens->string
console.log(pieces)
}
main()
```

## Note

- devilyouwei updated this repo to make this module support the js `require` keyword and added the using example.
Expand Down
8 changes: 4 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@sctg/sentencepiece-js",
"version": "1.2.0",
"version": "1.3.0",
"description": "Sentencepiece tokenization for natural language processing, JS version.",
"main": "dist/index.js",
"exports": {
Expand All @@ -11,12 +11,12 @@
"build": "./build.sh; rollup --config",
"test": "web-test-runner \"test/**/*.test.js\" \"src/**/*.test.js\" --node-resolve",
"test:watch": "web-test-runner \"test/**/*.test.js\" \"src/**/*.test.js\" --node-resolve --watch",
"develop": "web-dev-server --node-resolve --watch --open"
"develop": "web-dev-server --node-resolve --watch --open",
"convert_models": "node ./test/convertmodels.js"
},
"files": [
"./dist/index.js",
"./dist/index.d.ts",
"./dist/llama-3.1-tokenizer.model"
"./dist/index.d.ts"
],
"repository": {
"type": "git",
Expand Down
1 change: 1 addition & 0 deletions src/clean_30k.ts

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import { SentencePieceProcessor, cleanText } from "./sentencePieceProcessor";
import { llama_3_1_tokeniser_b64 } from "./llama_3_1_tokeniser_model";
import { smart_b64 } from "./smart";
import { clean_30k_b64 } from "./clean_30k";

export { SentencePieceProcessor, cleanText }
export default { SentencePieceProcessor, cleanText }
export { SentencePieceProcessor, cleanText, llama_3_1_tokeniser_b64, clean_30k_b64, smart_b64 };
export default { SentencePieceProcessor, cleanText };
1 change: 1 addition & 0 deletions src/llama_3_1_tokeniser_model.ts

Large diffs are not rendered by default.

31 changes: 25 additions & 6 deletions src/sentencePieceProcessor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,34 @@ export class SentencePieceProcessor {
processor: any;
sentencepiece: any;

uuidv4(): string {
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
var r = Math.random() * 16 | 0,
v = c == 'x' ? r : (r & 0x3 | 0x8);
return v.toString(16);
});
}

// load model from a base64 encoded string
async loadFromB64StringModel(b64model: string) {
// decode base64 string
const model = Buffer.from(b64model, 'base64');
await this._loadModel(model);
}

// load model
async load(url: string) {
const model = fs.readFileSync(url);
await this._loadModel(model);
}

this.sentencepiece = await Module();

// change to fs read model file
this.sentencepiece.FS.writeFile("sentencepiece.model", fs.readFileSync(url));
const string_view = new this.sentencepiece.StringView("sentencepiece.model");
// private function to load model
private async _loadModel(model: Buffer) {
const tempName = this.uuidv4() + ".model";
this.sentencepiece = await Module();
this.sentencepiece.FS.writeFile(tempName, model);
const string_view = new this.sentencepiece.StringView(tempName);
const absl_string_view = string_view.getView();

this.processor = new this.sentencepiece.SentencePieceProcessor();
Expand All @@ -22,10 +42,9 @@ export class SentencePieceProcessor {
load_status.delete();
absl_string_view.delete();
string_view.delete();

this.sentencepiece.FS.unlink(tempName);
}


encodeIds(text: string) {

const string_view = new this.sentencepiece.StringView(text);
Expand Down
1 change: 1 addition & 0 deletions src/smart.ts

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion src/test.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const { SentencePieceProcessor, cleanText } = require("../dist");
const { SentencePieceProcessor, cleanText, llama_3_1_tokeniser_b64 } = require("../dist");
const ROOT = require('app-root-path')

async function main() {
Expand All @@ -10,10 +10,22 @@ async function main() {
await spp.load(`${ROOT}/test/llama-3.1-tokenizer.model`)
let ids = spp.encodeIds(cleaned);
console.log(ids)
console.log(`Token length: ${ids.length}`)
let str = spp.decodeIds(ids)
console.log(str)

let pieces = spp.encodePieces(cleaned);
console.log(pieces)

let spp2 = new SentencePieceProcessor();
await spp2.loadFromB64StringModel(llama_3_1_tokeniser_b64);
let ids2 = spp2.encodeIds(cleaned);
console.log(ids2)
console.log(`Token length: ${ids2.length}`)
let str2 = spp2.decodeIds(ids2)
console.log(str2)

let pieces2 = spp2.encodePieces(cleaned);
console.log(pieces2);
}
main()
33 changes: 33 additions & 0 deletions test/convertmodels.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
const fs = require('fs');
const path = require('path');

// Read a binary tokenizer model file and generate a TypeScript module that
// exports its contents as a base64-encoded string constant, so the model can
// be bundled and loaded in the browser without filesystem access.
//   filePath     - path to the binary model file to read (relative to CWD)
//   output       - output .ts path, resolved relative to this script's directory
//   variableName - name of the exported const in the generated module
function convertModel(filePath, output, variableName) {
    fs.readFile(filePath, (err, data) => {
        if (err) {
            console.error('Erreur lors de la lecture du fichier binaire:', err);
            return;
        }

        // Encode the raw model bytes as base64 so they survive as a JS string literal.
        const base64Content = data.toString('base64');

        // Generated module body: a single exported string constant.
        const outputContent = `export const ${variableName} = "${base64Content}";`;

        // Resolve the output path relative to this script, not the CWD.
        const outputFilePath = path.join(__dirname, output);

        fs.writeFile(outputFilePath, outputContent, (err) => {
            if (err) {
                console.error('Erreur lors de l\'écriture du fichier de sortie:', err);
                return;
            }
            console.log('Fichier de sortie généré avec succès:', outputFilePath);
        });
    });
}

convertModel('./test/llama-3.1-tokenizer.model', '../src/llama_3_1_tokeniser_model.ts', 'llama_3_1_tokeniser_b64');
convertModel('./test/30k-clean.model', '../src/clean_30k.ts', 'clean_30k_b64');
convertModel('./test/smart.model', '../src/smart.ts', 'smart_b64');

0 comments on commit 865cb6b

Please sign in to comment.