Automatically add EOS via Tokenizer, add Sentence Transformers snippet

#2
by tomaarsen HF Staff - opened
Files changed (2) hide show
  1. README.md +52 -7
  2. tokenizer.json +2 -2
README.md CHANGED
@@ -5,9 +5,11 @@ datasets:
5
  - codefuse-ai/F2LLM
6
  language:
7
  - en
 
 
8
  license: apache-2.0
9
  pipeline_tag: feature-extraction
10
- library_name: transformers
11
  ---
12
 
13
  # F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data
@@ -18,7 +20,38 @@ F2LLMs (Foundation to Feature Large Language Models) are foundation models direc
18
 
19
  ## Usage
20
 
21
- To encode a batch of sentences:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  ```python
24
  from transformers import AutoModel, AutoTokenizer
@@ -30,22 +63,34 @@ model_path = "codefuse-ai/F2LLM-0.6B"
30
  tokenizer = AutoTokenizer.from_pretrained(model_path)
31
  model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map={'': 0})
32
 
33
- sentences = [
 
 
34
  'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
35
- 'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.'
 
36
  ]
37
 
38
  def encode(sentences):
39
  batch_size = len(sentences)
40
- sentences = [s+tokenizer.eos_token for s in sentences]
41
- tokenized_inputs = tokenizer(sentences, padding=True, return_tensors='pt', add_special_tokens=False).to(model.device)
42
  last_hidden_state = model(**tokenized_inputs).last_hidden_state
43
  eos_positions = tokenized_inputs.attention_mask.sum(dim=1) - 1
44
  embeddings = last_hidden_state[torch.arange(batch_size, device=model.device), eos_positions]
45
  embeddings = F.normalize(embeddings, p=2, dim=1)
46
  return embeddings
47
 
48
- embeddings = encode(sentences)
 
 
 
 
 
 
 
 
 
 
49
  ```
50
 
51
  ## Evaluation
 
5
  - codefuse-ai/F2LLM
6
  language:
7
  - en
8
+ tags:
9
+ - transformers
10
  license: apache-2.0
11
  pipeline_tag: feature-extraction
12
+ library_name: sentence-transformers
13
  ---
14
 
15
  # F2LLM Technical Report: Matching SOTA Embedding Performance with 6 Million Open-Source Data
 
20
 
21
  ## Usage
22
 
23
+ ### With Sentence Transformers
24
+
25
+ To encode text using F2LLM with the [Sentence Transformers](https://www.sbert.net/) library:
26
+
27
+ ```python
28
+ from sentence_transformers import SentenceTransformer
29
+
30
+ model = SentenceTransformer("codefuse-ai/F2LLM-0.6B", model_kwargs={"torch_dtype": "bfloat16"})
31
+
32
+ # Some sample query and documents
33
+ query = "What is F2LLM used for?"
34
+ documents = [
35
+ 'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
36
+ 'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.',
37
+ 'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.'
38
+ ]
39
+
40
+ # Encode the query and documents separately; the encode_query method uses the query prompt
41
+ query_embedding = model.encode_query(query)
42
+ document_embeddings = model.encode_document(documents)
43
+ print(query_embedding.shape, document_embeddings.shape)
44
+ # (1024,) (3, 1024)
45
+
46
+ # Compute cosine similarity between the query and documents
47
+ similarity = model.similarity(query_embedding, document_embeddings)
48
+ print(similarity)
49
+ # tensor([[0.5132, 0.5376, 0.8017]])
50
+ ```
51
+
52
+ ### With Transformers
53
+
54
+ Or encode text directly with the [Transformers](https://huggingface.co/docs/transformers/index) library:
55
 
56
  ```python
57
  from transformers import AutoModel, AutoTokenizer
 
63
  tokenizer = AutoTokenizer.from_pretrained(model_path)
64
  model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map={'': 0})
65
 
66
+ query = "What is F2LLM used for?"
67
+ query_prompt = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"
68
+ documents = [
69
  'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
70
+ 'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.',
71
+ 'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.'
72
  ]
73
 
74
  def encode(sentences):
75
  batch_size = len(sentences)
76
+ tokenized_inputs = tokenizer(sentences, padding=True, return_tensors='pt').to(model.device)
 
77
  last_hidden_state = model(**tokenized_inputs).last_hidden_state
78
  eos_positions = tokenized_inputs.attention_mask.sum(dim=1) - 1
79
  embeddings = last_hidden_state[torch.arange(batch_size, device=model.device), eos_positions]
80
  embeddings = F.normalize(embeddings, p=2, dim=1)
81
  return embeddings
82
 
83
+ # Encode the query and documents
84
+ query_embedding = encode([query_prompt + query])
85
+ document_embeddings = encode(documents)
86
+ print(query_embedding.shape, document_embeddings.shape)
87
+ # torch.Size([1, 1024]) torch.Size([3, 1024])
88
+
89
+ # Compute cosine similarity between the query and documents
90
+ similarity = query_embedding @ document_embeddings.T
91
+ print(similarity)
92
+ # tensor([[0.5039, 0.5312, 0.7930]], device='cuda:0', dtype=torch.bfloat16,
93
+ # grad_fn=<MmBackward0>)
94
  ```
95
 
96
  ## Evaluation
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
- size 11422654
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38360d5a512a43641b36d6fba2df87b8a3f5464c6b5c76f03e82d6d795175566
3
+ size 11423195