Home Reference Source Test

packages/causality-preprocessing/src/Tokenizer/tokenizer.js

import { default as SentencePieceModel } from './sentencePiece';
import { jsonUtils } from 'causal-net.utils';
/**
 * Tokenizer converts a sentence into token ids (or token strings) using a SentencePiece model.
 * @experiment
 * @class Tokenizer
 * @example
 * [EXAMPLE ../../nlpPreprocessing.babel.js]
 */
class Tokenizer{
    /**
     * Create a tokenizer with no model and an empty vocabulary.
     * `connect` has to resolve before `encode` / `tokenize` can be used.
     */
    constructor(){
        this.model = null;
        this.vocab = [];
    }
    /**
     * Fetch the vocabulary JSON at `link` and build the SentencePiece model from it.
     * Instance state is only replaced after both steps succeed, so a failed
     * (re)connect leaves the previously loaded vocabulary and model usable.
     * @param { String } link - location handed to `jsonUtils.queryJSON`
     * @returns { Promise } resolves once `vocab` and `model` are both set
     */
    async connect(link){
        const vocab = await jsonUtils.queryJSON(link);
        // Build the model first: if construction throws, `this.vocab` must not
        // already point at a vocabulary that the current model was not built from.
        const model = new SentencePieceModel(vocab);
        this.vocab = vocab;
        this.model = model;
    }
    /**
     * Tokenize `text`.
     * @param { String } text - input sentence
     * @param { Boolean } [asEncode=true] - when true return the ids produced by `encode`;
     * otherwise map each id to element 0 of its vocabulary entry
     * (presumably the token string of a [piece, score] pair - confirm against the vocabulary file)
     * @returns { Array } token ids, or the vocabulary items looked up from them
     * @throws { Error } when no model has been loaded
     */
    tokenize(text, asEncode=true){
        const wids = this.encode(text);
        if(asEncode){
            return wids;
        }
        return wids.map(id=>this.vocab[id][0]);
    }
    /**
     * Encode `text` with the loaded model.
     * @param { String } text - input sentence
     * @returns { Array } whatever `model.encode` returns (used as an array of ids by `tokenize`)
     * @throws { Error } when `connect` has not loaded a model yet
     */
    encode(text){
        if(!this.model){
            throw new Error(`model is not loaded`);
        }
        return this.model.encode(text);
    }
}
// Single shared Tokenizer instance; every importer of this module receives the same object.
const tokenizer = new Tokenizer();
export default tokenizer;