How to use Mistral with Node.js / transformers.js?

#52
by lancejpollard - opened

Doing this:

import 'dotenv/config'
import fs from 'fs/promises'
import {
  env,
  pipeline,
  AutoTokenizer,
  AutoModelForCausalLM,
} from '@xenova/transformers'

// env.allowRemoteModels = false
// env.localModelPath = './import/language/tibetan/models'
// # Use a pipeline as a high-level helper
// from transformers import pipeline

// pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B")
/**
 * Ask a text-generation model to condense each term's raw definitions into
 * short definitions (by default 1-3 words each).
 *
 * NOTE(review): transformers.js loads ONNX weights from the model repo
 * (it requests files under `onnx/`). The default model id below is kept
 * from the original code, but that repo is gated and appears to ship no
 * ONNX export — confirm, and pass an ONNX-converted model via
 * `options.modelId` if so.
 *
 * @param {Record<string, unknown>} definitions - Map of term to its raw
 *   definitions (any JSON-serializable value).
 * @param {object} [options]
 * @param {string} [options.modelId] - Hugging Face model id handed to
 *   `pipeline()`.
 * @param {number} [options.maxTerms=100] - Maximum number of terms to process.
 * @param {number} [options.maxWords=3] - Longest definition, in words, to keep.
 * @returns {Promise<Record<string, { definitions: string[] }>>} Map of term
 *   to its cleaned short definitions.
 * @throws {TypeError} If the pipeline output has no string `generated_text`.
 */
async function summarizeDefinitions(definitions, options = {}) {
  const {
    modelId = 'mistralai/Mistral-Nemo-Instruct-2407',
    maxTerms = 100,
    maxWords = 3,
  } = options

  // pipeline() expects a model id and loads the tokenizer and model itself;
  // passing already-loaded model/tokenizer objects is not supported.
  const generator = await pipeline('text-generation', modelId)

  const cleanedDefinitions = {}

  // Own keys only, capped at exactly `maxTerms` entries.
  const entries = Object.entries(definitions).slice(0, maxTerms)

  for (const [term, defs] of entries) {
    const prompt = `Please summarize these definitions into a JSON array of simple ideally 1-3 word definitions: ${JSON.stringify(
      defs,
      null,
      2,
    )}`

    const output = await generator(prompt, {
      max_length: 1000, // adjust length based on your requirements
      min_length: 1,
      do_sample: false,
    })

    console.log(output)

    const completion = extractCompletion(output, prompt)

    cleanedDefinitions[term] = {
      definitions: parseShortDefinitions(completion, maxWords),
      // type: 'noun', // or determine part-of-speech based on your logic
    }
  }

  return cleanedDefinitions
}

/**
 * Pull the generated text out of a text-generation pipeline result and drop
 * the echoed prompt, if the model output starts with it.
 */
function extractCompletion(output, prompt) {
  const generated = output?.[0]?.generated_text
  if (typeof generated !== 'string') {
    throw new TypeError(
      'Expected text-generation output of the form [{ generated_text: string }]',
    )
  }
  return generated.startsWith(prompt)
    ? generated.slice(prompt.length)
    : generated
}

/**
 * Turn model output into a list of definitions of at most `maxWords` words.
 * Prefers a JSON array embedded in the text (that is what the prompt asks
 * for) and falls back to splitting on periods and commas.
 */
function parseShortDefinitions(text, maxWords) {
  const candidates = parseEmbeddedJsonArray(text) ?? text.split(/[.,]/)

  return candidates
    .map(phrase => phrase.trim())
    .filter(
      phrase => phrase.length > 0 && phrase.split(/\s+/).length <= maxWords,
    )
}

/**
 * Return the string items of the outermost `[...]` found in `text`, or null
 * when there is no parseable JSON array.
 */
function parseEmbeddedJsonArray(text) {
  const start = text.indexOf('[')
  const end = text.lastIndexOf(']')
  if (start === -1 || end <= start) {
    return null
  }

  try {
    const parsed = JSON.parse(text.slice(start, end + 1))
    return Array.isArray(parsed)
      ? parsed.filter(item => typeof item === 'string')
      : null
  } catch {
    // Not valid JSON: the caller falls back to punctuation splitting.
    return null
  }
}

/**
 * Script entry point: reads the definitions JSON file from disk, passes the
 * parsed object to summarizeDefinitions, and prints the result.
 */
async function main() {
  const rawJson = await fs.readFile(
    `import/language/tibetan/definitions.out.json`,
    `utf-8`,
  )
  const definitions = JSON.parse(rawJson)

  const summarized = await summarizeDefinitions(definitions)

  console.log(summarized)
}

// Report any failure from main() and exit non-zero instead of leaving the
// returned promise floating.
main().catch(error => {
  console.error(error)
  process.exitCode = 1
})

I get:

Error: Unauthorized access to file: "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407/resolve/main/tokenizer_config.json".

What do I need to do to get this working?

If I add an access token, I get this error now:

Error: Could not locate file: "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407/resolve/main/onnx/decoder_model_merged_quantized.onnx".

Any ideas?

Sign up or log in to comment