openai-clip-js

NOTE (Sept 7th 2023): At this point you may want to use Transformers.js instead since it’s well-maintained and supports quantized models which are much smaller. That said, if you don’t want to include the whole Transformers.js library in your app (as of writing I’m not sure if tree-shaking is supported yet), then you can still directly use ONNX Runtime Web with the quantized models produced by the Transformers.js conversion scripts.
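For reference, here's a rough sketch of what that direct ONNX Runtime Web route looks like for the vision side. The model URL, the `pixel_values` input name, and the `image_embeds` output name are assumptions based on how the Transformers.js/Optimum conversion scripts typically lay out their exports, so double-check them against the actual ONNX files:

// Load ONNX Runtime Web first, e.g. <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
// (exposes the global `ort`). The model URL and tensor names below are assumptions - verify against your exported files.
let modelUrl = 'https://huggingface.co/Xenova/clip-vit-base-patch16/resolve/main/onnx/vision_model_quantized.onnx';
let session = await ort.InferenceSession.create(modelUrl);
// The input must be a preprocessed image: resized/cropped to 224x224, RGB, CLIP mean/std normalized, shaped [1, 3, 224, 224].
let pixels = new Float32Array(1 * 3 * 224 * 224); // fill with real preprocessed pixel values
let results = await session.run({ pixel_values: new ort.Tensor('float32', pixels, [1, 3, 224, 224]) });
console.log(results.image_embeds.data); // image embedding (512 values for the base models)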

Here's a full working example that uses Transformers.js (the relevant modules are imported from its CDN build in the snippet below):

let quantized = false; // change to `true` for a much smaller model (e.g. 87mb vs 345mb for image model), but lower accuracy
let { AutoProcessor, CLIPVisionModelWithProjection, RawImage, AutoTokenizer, CLIPTextModelWithProjection } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.5.4/dist/transformers.js');
let imageProcessor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16');
let visionModel = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16', {quantized});
let tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16');
let textModel = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16', {quantized});

function cosineSimilarity(A, B) {
  if(A.length !== B.length) throw new Error("A.length !== B.length");
  let dotProduct = 0, mA = 0, mB = 0;
  for(let i = 0; i < A.length; i++){
    dotProduct += A[i] * B[i];
    mA += A[i] * A[i];
    mB += B[i] * B[i];
  }
  mA = Math.sqrt(mA);
  mB = Math.sqrt(mB);
  let similarity = dotProduct / (mA * mB);
  return similarity;
}

// get image embedding:
let image = await RawImage.read('https://i.imgur.com/RKsLoNB.png');
let imageInputs = await imageProcessor(image);
let { image_embeds } = await visionModel(imageInputs);
console.log(image_embeds.data);

// get text embedding:
let texts = ['a photo of an astronaut'];
let textInputs = tokenizer(texts, { padding: true, truncation: true });
let { text_embeds } = await textModel(textInputs);
console.log(text_embeds.data);

let similarity = cosineSimilarity(image_embeds.data, text_embeds.data);
console.log(similarity);
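A single image/text similarity number isn't very meaningful on its own; the usual CLIP recipe is to embed several candidate captions and softmax the scaled similarities to get relative scores. Here's a quick sketch reusing the models and cosineSimilarity from above (CLIP's learned logit scale is roughly 100, but treat that constant as an assumption):

// sketch: compare the image against several captions and softmax the scaled similarities
let labels = ['a photo of an astronaut', 'a photo of a cat', 'a photo of a pizza'];
let labelInputs = tokenizer(labels, { padding: true, truncation: true });
let { text_embeds: labelEmbeds } = await textModel(labelInputs);
let embedSize = labelEmbeds.dims[1]; // embedding length per caption (512 for the base models)
let sims = labels.map((_, i) => cosineSimilarity(image_embeds.data, labelEmbeds.data.slice(i * embedSize, (i + 1) * embedSize)));
let exps = sims.map(s => Math.exp(s * 100)); // ~100 is CLIP's usual logit scale
let expSum = exps.reduce((a, b) => a + b, 0);
console.log(labels.map((label, i) => `${label}: ${(exps[i] / expSum * 100).toFixed(1)}%`));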

Note that the above code uses clip-vit-base-patch16 rather than the model used in this repo, clip-vit-base-patch32. I'm not sure which is better; you can change patch16 to patch32 in the code above if you want to compare them. Also note that you'll see some GET/404 errors in the console. That's expected, since Transformers.js tries to load models from a local path before falling back to the Hugging Face Hub. There's probably a way to disable this.
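If I'm reading the Transformers.js env settings right, something like this should skip the local lookup entirely (untested sketch):

let { env } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.5.4/dist/transformers.js');
env.allowLocalModels = false; // don't try local paths first; fetch models straight from the Hugging Face Hub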

Transformers.js also has a ton of other models available, and it's quite easy to use. For example, here's a text embedding / retrieval model:

let { pipeline } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.5.4/dist/transformers.js');
let extractor = await pipeline('feature-extraction', 'Xenova/e5-large-v2');
let dotProduct = (vec1, vec2) => vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);

let passage1 = await extractor('passage: She likes carrots and celery.', { pooling: 'mean', normalize: true });
let passage2 = await extractor('passage: This is a good calculus guide.', { pooling: 'mean', normalize: true });
let query = await extractor('query: Taking care of rabbits', { pooling: 'mean', normalize: true });

let similarity1 = dotProduct(query.data, passage1.data);
let similarity2 = dotProduct(query.data, passage2.data);
console.log(similarity1, similarity2); // how well each passage matches the query
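Since normalize: true is passed, the pooled embeddings are unit-length, so the dot product here is equivalent to cosine similarity. Also note the 'query: ' / 'passage: ' prefixes, which the e5 models expect.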

OpenAI CLIP JavaScript

OpenAI’s CLIP model ported to JavaScript using ONNX Runtime Web. I also got the LiT models working here.

Minimal demos:

Example applications:

Server side:

Notes:

Todo (maybe):