import json
from collections import Counter, defaultdict

import numpy as np

# Escaped spellings used in the printed table for whitespace control bytes.
_ESCAPED_WHITESPACE = {9: '\\t', 10: '\\n', 13: '\\r'}

# Human-readable names for a handful of well-known byte values.
_BYTE_DESCRIPTIONS = {
    0: 'NUL (Null)',
    9: 'HT (Horizontal Tab)',
    10: 'LF (Line Feed)',
    13: 'CR (Carriage Return)',
    27: 'ESC (Escape)',
    32: 'SPACE',
    127: 'DEL (Delete)',
}


def _display_char(byte: int) -> str:
    """Return how *byte* is shown in the printed table.

    Printable ASCII (32..126) is shown as the character itself, tab / line
    feed / carriage return as their backslash escapes, and every other byte
    as an empty string.
    """
    if 32 <= byte <= 126:
        return chr(byte)
    return _ESCAPED_WHITESPACE.get(byte, '')


def unigram_first_byte_prob(corpus_path: str, output_path: str) -> None:
    """Estimate the unigram distribution over UTF-8 bytes of a JSONL corpus.

    Each non-blank line of ``corpus_path`` is parsed as a JSON object and the
    UTF-8 encoding of its ``'text'`` field is tallied byte by byte. Note that
    every byte of the text is counted, not only the first byte of each
    character. The resulting 256 relative frequencies are printed as a
    tab-separated table and written to ``output_path`` as a JSON list indexed
    by byte value.

    Args:
        corpus_path: Path to a UTF-8 JSONL file whose records have a
            ``'text'`` key.
        output_path: Destination of the JSON list of 256 probabilities.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'text'`` key.
    """
    byte_counts = Counter()
    total_bytes = 0

    # Stream the corpus instead of materialising every encoded line first, so
    # memory use stays constant regardless of corpus size.
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            utf8_bytes = json.loads(line)['text'].encode('utf-8')
            # Iterating bytes yields ints in 0..255; Counter tallies them in C.
            byte_counts.update(utf8_bytes)
            total_bytes += len(utf8_bytes)

    # Bytes that never occur keep probability 0.0. With an empty corpus the
    # loop body never runs, so there is no division by zero.
    unigram_prob = [0.0] * 256
    for byte, count in byte_counts.items():
        unigram_prob[byte] = count / total_bytes

    for byte, p in enumerate(unigram_prob):
        char = _display_char(byte)
        desc = _BYTE_DESCRIPTIONS.get(byte, '')
        print(f"{byte}\t{p:.6f}\t\t{char}\t\t{desc}")

    with open(output_path, 'w') as f:
        json.dump(unigram_prob, f)


def smoothing_unigram_prob(unigram_prob_path: str, smoothing_factor: float, output_path: str) -> None:
    """Mix a stored unigram distribution with the uniform distribution.

    Reads a JSON list of probabilities, computes
    ``p * (1 - smoothing_factor) + smoothing_factor / vocab_size`` for every
    entry (``vocab_size`` being the length of the list), casts the result to
    float32 and writes it to ``output_path`` as a JSON list.

    Args:
        unigram_prob_path: Path to the JSON list of probabilities.
        smoothing_factor: Weight of the uniform distribution, in ``[0, 1]``.
        output_path: Destination of the smoothed JSON list.

    Raises:
        ValueError: If ``smoothing_factor`` is outside ``[0, 1]`` (or NaN),
            since the result would no longer be a probability distribution.
    """
    if not 0.0 <= smoothing_factor <= 1.0:
        raise ValueError(
            f"smoothing_factor must be in [0, 1], got {smoothing_factor!r}"
        )

    with open(unigram_prob_path, 'r') as f:
        unigram_prob = json.load(f)

    # Interpolate in float64 and only round to float32 at the very end.
    unigram_prob_np = np.array(unigram_prob, dtype=np.float64)
    vocab_size = unigram_prob_np.shape[0]
    smoothed_prob_np = (
        unigram_prob_np * (1 - smoothing_factor) + smoothing_factor / vocab_size
    )
    smoothed_prob = smoothed_prob_np.astype(np.float32).tolist()

    with open(output_path, 'w') as f:
        json.dump(smoothed_prob, f)


if __name__ == '__main__':
    unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob.json'
    smoothing_factor = 0.2
    smoothed_unigram_prob_path = '/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/opencoder13G_unigram_prob_smooth{}.json'

    # Step 1 (disabled): recompute the raw unigram table from the corpus.
    # unigram_first_byte_prob(
    #     'subsample_opencoder.jsonl',
    #     unigram_prob_path
    # )

    # Step 2: smooth the stored table; the factor is embedded in the filename.
    smoothing_unigram_prob(
        unigram_prob_path,
        smoothing_factor,
        smoothed_unigram_prob_path.format(smoothing_factor)
    )