0. Outline¶
Two types of models:
- Open-source (logits accessible):
  - tokenization
  - getting logits and log probabilities
  - generation / prompting
- Closed-source:
  - prompting
If you are curious about using SLOR (briefly defined after this outline):
- How to obtain unigram probabilities?
  - given a model, identify a portion of its training corpus;
  - tokenize a fragment of this corpus;
  - count each token's occurrence frequency.
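For reference, SLOR (the Syntactic Log-Odds Ratio) is commonly defined as the length-normalized difference between the log probability a model assigns to a sentence and the sentence's unigram log probability:
SLOR(s) = ( log p_M(s) - log p_u(s) ) / |s|
where p_M(s) is the model's probability of sentence s, p_u(s) is the product of the unigram probabilities of its tokens, and |s| is the sentence length in tokens. Part I shows how to get log p_M(s), Part III estimates the unigram probabilities, and a worked sketch combining the two appears at the end of Part III.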
Prerequisites:
!pip install transformers torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import torch.nn.functional as F
import json
from tqdm import tqdm
# from collections import Counter
Loading a model and its tokenizer¶
In addition to the GPT-2 family used in this tutorial, a common alternative is the Llama family. Go to Hugging Face to check out the variety of models you can run (a sketch for loading a larger model follows the cell below).
# Choose model: GPT-2, LLaMA, etc.
model_name = "gpt2" # or "meta-llama/Llama-2-7b-hf" if using LLaMA 2
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval() # turn off dropout
if torch.cuda.is_available():
    model = model.cuda()
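If you opt for a larger model such as Llama-2-7B instead of GPT-2, loading it in half precision and letting transformers place the weights automatically keeps memory usage manageable. A sketch (requires the accelerate package and, for Llama-2, access approval on Hugging Face):
# Optional sketch for loading a larger model in half precision:
# model = AutoModelForCausalLM.from_pretrained(
#     "meta-llama/Llama-2-7b-hf",
#     torch_dtype=torch.float16,   # half-precision weights
#     device_map="auto",           # place weights on available devices (needs accelerate)
# )
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")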
Part I: Open Source Models¶
1.1 Check properties of the tokenizer¶
Summary:
- From string (e.g. sentences) to token_id (list):
tokenizer.encode(sentence)
- From string to token_ids (tensor) & attention mask (tensor):
tokenizer(sentence, return_tensors="pt")
- See individual string tokens given token_ids (list):
tokenizer.convert_ids_to_tokens(token_ids)
- Convert token_ids (list) back to a string:
tokenizer.decode(token_ids)
sentence = "Every morning, I drink a big glass of"
tokenizer.encode(sentence)
[6109, 3329, 11, 314, 4144, 257, 1263, 5405, 286]
Converting between sentences and tokens¶
tokenizer(sentence, return_tensors="pt")
{'input_ids': tensor([[6109, 3329, 11, 314, 4144, 257, 1263, 5405, 286]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
# Tokenize into input IDs
input_ids = tokenizer.encode(sentence, add_special_tokens=False)
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print("Original text:", sentence)
print("Token IDs:", input_ids)
print("Tokens:", tokens)
Original text: Every morning, I drink a big glass of
Token IDs: [6109, 3329, 11, 314, 4144, 257, 1263, 5405, 286]
Tokens: ['Every', 'Ġmorning', ',', 'ĠI', 'Ġdrink', 'Ġa', 'Ġbig', 'Ġglass', 'Ġof']
# Convert tokens to text (joined by tokenizer rules)
decoded_text = tokenizer.decode(input_ids)
print("Decoded text:", decoded_text)
Decoded text: Every morning, I drink a big glass of
Special Tokens¶
print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Tokenizer special tokens:", tokenizer.special_tokens_map)
print("Padding token:", tokenizer.pad_token)
print("EOS token:", tokenizer.eos_token)
print("UNK token:", tokenizer.unk_token)
Using pad_token, but it is not set yet.
Tokenizer vocab size: 50257
Tokenizer special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}
Padding token: None
EOS token: <|endoftext|>
UNK token: <|endoftext|>
Tokenizers may not behave as you expect¶
phrases = ["cat", " cats", "The cat", "The cat's", "Thecat", "chihuahua", 'substance', 'superlative']
for phrase in phrases:
    ids = tokenizer.encode(phrase, add_special_tokens=False)
    toks = tokenizer.convert_ids_to_tokens(ids)
    print(f"\nText: '{phrase}'")
    print("Tokens:", toks)
Text: 'cat'
Tokens: ['cat']

Text: ' cats'
Tokens: ['Ġcats']

Text: 'The cat'
Tokens: ['The', 'Ġcat']

Text: 'The cat's'
Tokens: ['The', 'Ġcat', "'s"]

Text: 'Thecat'
Tokens: ['The', 'cat']

Text: 'chihuahua'
Tokens: ['ch', 'ihu', 'ah', 'ua']

Text: 'substance'
Tokens: ['sub', 'st', 'ance']

Text: 'superlative'
Tokens: ['super', 'l', 'ative']
Processing multiple strings¶
# List of strings to tokenize
texts = [
    "The cat sat on the mat.",
    "A quick brown fox.",
    "GPT-2 is a language model."
]
tokenizer.pad_token = tokenizer.eos_token
# Tokenize with padding and truncation
encoded = tokenizer(
    texts,
    padding=True,               # pad to the longest sequence
    truncation=True,            # truncate if needed (max_length optional)
    return_tensors="pt",        # return PyTorch tensors
    return_attention_mask=True,
    return_token_type_ids=False
)
# Print raw inputs and tokenized outputs
for i, text in enumerate(texts):
    print(f"\nText {i+1}: {text}")
    input_ids = encoded["input_ids"][i]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    print("Token IDs:", input_ids.tolist())
    print("Tokens :", tokens)
print(encoded['input_ids'].shape) #[batch_size, seq_len]
Text 1: The cat sat on the mat.
Token IDs: [464, 3797, 3332, 319, 262, 2603, 13, 50256, 50256]
Tokens : ['The', 'Ġcat', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat', '.', '<|endoftext|>', '<|endoftext|>']

Text 2: A quick brown fox.
Token IDs: [32, 2068, 7586, 21831, 13, 50256, 50256, 50256, 50256]
Tokens : ['A', 'Ġquick', 'Ġbrown', 'Ġfox', '.', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>']

Text 3: GPT-2 is a language model.
Token IDs: [38, 11571, 12, 17, 318, 257, 3303, 2746, 13]
Tokens : ['G', 'PT', '-', '2', 'Ġis', 'Ġa', 'Ġlanguage', 'Ġmodel', '.']
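The attention_mask returned alongside input_ids marks which positions hold real tokens (1) and which are padding (0); passing it to the model keeps the padded positions from being attended to. A quick check, reusing the encoded batch from above:
# 1 = real token, 0 = padding; same [batch_size, seq_len] shape as input_ids
print(encoded["attention_mask"])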
1.2 Get the log_prob assigned by the model to the sentence¶
Run the model on the input and get the logits¶
with torch.no_grad():  # disable gradient tracking (model.eval() alone does not do this)
    tokens = torch.tensor(tokenizer.encode(sentence))
    outputs = model(tokens)
    print(tokens.shape)
logits = outputs.logits  # shape: (seq_len, vocab_size) here, since the 1-D input has no batch dimension
probs = F.log_softmax(logits, dim=-1)
print(outputs.keys())
print("Logits shape:", logits.shape)
torch.Size([9])
odict_keys(['logits', 'past_key_values'])
Logits shape: torch.Size([9, 50257])
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits # shape: (batch_size, seq_len, vocab_size)
probs = F.log_softmax(logits, dim=-1)
print(inputs['input_ids'].shape)
print("Logits shape:", logits.shape)
torch.Size([1, 9])
Logits shape: torch.Size([1, 9, 50257])
Convert the logits into probability distributions¶
torch.gather(input, dim, index, *):
out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0
out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1
out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2
# Get token-level log probabilities
input_ids = inputs["input_ids"]
log_probs = torch.gather(probs[:, :-1, :], 2, input_ids[:, 1:].unsqueeze(-1)).squeeze(-1)
print(log_probs)
print(probs[:, :-1, :].shape)
print(input_ids[:, 1:].unsqueeze(-1).shape)
tensor([[-7.9162, -1.1918, -1.6216, -7.0002, -1.8646, -4.9324, -1.0927, -0.0740]])
torch.Size([1, 8, 50257])
torch.Size([1, 8, 1])
Check individual tokens' probabilities¶
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print("Token\tLogProb")
for token, lp in zip(tokens[1:], log_probs[0]):
    print(f"{token}\t{lp.item():.4f}")
Token	LogProb
Ġmorning	-7.9162
,	-1.1918
ĠI	-1.6216
Ġdrink	-7.0002
Ġa	-1.8646
Ġbig	-4.9324
Ġglass	-1.0927
Ġof	-0.0740
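Summing the per-token log probabilities gives the log probability of the whole continuation under the model (the first token is not scored, since it has no left context here), and the mean gives a length-normalized score. A minimal sketch reusing log_probs from above:
sentence_log_prob = log_probs.sum().item()   # total log prob of tokens 2..n
mean_log_prob = log_probs.mean().item()      # length-normalized version
print(f"Sentence log prob: {sentence_log_prob:.4f}")
print(f"Mean token log prob: {mean_log_prob:.4f}")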
Predict the next word¶
# Let's see what word GPT-2 assigns the highest
# probability to!
# We use -1 to look at the last position - i.e.,
# what is predicted to follow the last input word?
# argmax then gives us the index that has the highest
# value. And using tokenizer.decode lets us see what
# word has that index.
pred_next = outputs["logits"][0, -1].argmax(dim=-1)
word = tokenizer.decode([pred_next])
print(word)
wine
# What if we want to see more than just the single
# most likely word? For this, we can use the
# function topk
topv, topi = torch.topk(outputs["logits"][0,-1], 10)
# Let's see what all these topk words were
for token in topi:
    print(tokenizer.decode([token]))
wine
water
coffee
milk
beer
tea
red
vodka
orange
champagne
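To see how much probability mass each candidate actually receives, softmax the last position's logits before taking the top k. A small sketch reusing outputs from above:
# Convert the last position's logits to probabilities, then take the top 10
last_probs = F.softmax(outputs["logits"][0, -1], dim=-1)
topv, topi = torch.topk(last_probs, 10)
for p, token in zip(topv, topi):
    print(f"{tokenizer.decode([token])!r}\t{p.item():.4f}")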
1.3 Do generation / prompting¶
# Free generation with temperature and sampling
prompt = "Once upon a time"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
if torch.cuda.is_available():
    input_ids = input_ids.cuda()
set_seed(42) # For reproducibility
# `generated` will have shape [batch_size = 1, seq_len = 4 prompt tokens + 50 new tokens = 54]
generated = model.generate(
    input_ids,
    max_new_tokens=50,   # generate at most 50 new tokens
    temperature=0.8,     # randomness in sampling: higher = more random, lower = more predictable
    top_k=50,            # limit sampling to the k most likely next tokens (set to 0 to disable, 1 is effectively greedy)
    top_p=0.95,          # nucleus sampling: keep the smallest set of tokens whose cumulative probability reaches 0.95
    do_sample=True,      # sample instead of greedy decoding or beam search
    pad_token_id=tokenizer.eos_token_id
)
output_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("\nGenerated text:\n", output_text)
Generated text: Once upon a time, there was a certain person in the world who believed, that if one wished to be a man, one should not act according to nature. And yet, what you did, you made yourself a man. And what you did not, you did
generated = model.generate(
    input_ids,
    max_new_tokens=50,   # generate at most 50 new tokens
    temperature=0.8,     # randomness in sampling: higher = more random, lower = more predictable
    top_k=1,             # with k = 1, sampling always picks the single most likely token (effectively greedy)
    top_p=0.95,          # nucleus sampling: keep the smallest set of tokens whose cumulative probability reaches 0.95
    do_sample=True,      # sample instead of greedy decoding or beam search
    pad_token_id=tokenizer.eos_token_id
)
output_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("\nGenerated text:\n", output_text)
Generated text: Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger
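The greedy run above quickly falls into a repetition loop, which is typical of greedy decoding. generate also supports beam search and n-gram repetition blocking; the settings below are an illustrative sketch, not a recommendation:
generated = model.generate(
    input_ids,
    max_new_tokens=50,
    do_sample=False,            # deterministic decoding
    num_beams=4,                # beam search over 4 hypotheses
    no_repeat_ngram_size=3,     # never repeat the same 3-gram
    pad_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(generated[0], skip_special_tokens=True))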
Part II: Closed-source Models¶
You need to pay (a tiny amount at the scale of a class project) to run closed-source language models, e.g., GPT-4o from OpenAI. To do so, you need to register an account with OpenAI and get an API key.
Set Up OpenAI-related information¶
import openai
OPENAI_API_KEY="_YOUR_OPENAI_KEY" # replace with your OpenAI API key; the client.completions.create() calls below will use this key to authenticate requests
client = openai.OpenAI(api_key=OPENAI_API_KEY)
Some legacy versions of GPT-3 (but not GPT-3.5 or above) return token-level log probabilities¶
examples = [{"sent": "The cat sat on the mat."},
            {"sent": "A quick brown fox."},
            {"sent": "GPT-2 is a language model."},
            {"sent": "The cat sat on the mat."},
            {"sent": "A quick brown fox."},
            {"sent": "GPT-2 is a language model."}]
sentences = [example["sent"] for example in examples]
for i in tqdm(range(len(sentences))):
    response = client.completions.create(
        model="babbage-002",  # this is still valid for logprobs
        prompt=sentences[i],  # just a string, not a list
        max_tokens=0,         # to just score the prompt
        temperature=0.0,
        logprobs=0,
        echo=True)
    examples[i]["logprobs"] = response.choices[0].logprobs.token_logprobs
    examples[i]["tokens"] = response.choices[0].logprobs.tokens
    examples[i]["top_logprobs"] = response.choices[0].logprobs.top_logprobs
    examples[i]["text"] = response.choices[0].text
100%|██████████| 6/6 [00:07<00:00, 1.25s/it]
examples
[{'sent': 'The cat sat on the mat.', 'logprobs': [None, -9.086615, -4.321738, -1.3251734, -0.5576507, -1.7575874, -1.9625852], 'tokens': ['The', ' cat', ' sat', ' on', ' the', ' mat', '.'], 'top_logprobs': None, 'text': 'The cat sat on the mat.'}, {'sent': 'A quick brown fox.', 'logprobs': [None, -8.135078, -8.417381, -0.029080056, -6.401503], 'tokens': ['A', ' quick', ' brown', ' fox', '.'], 'top_logprobs': None, 'text': 'A quick brown fox.'}, {'sent': 'GPT-2 is a language model.', 'logprobs': [None, -8.272442, -2.0609777, -0.8916882, -3.6185596, -1.4614378, -5.422453, -1.007168, -3.5893934], 'tokens': ['G', 'PT', '-', '2', ' is', ' a', ' language', ' model', '.'], 'top_logprobs': None, 'text': 'GPT-2 is a language model.'}, {'sent': 'The cat sat on the mat.', 'logprobs': [None, -9.086615, -4.321738, -1.3251734, -0.5576507, -1.7575874, -1.9625852], 'tokens': ['The', ' cat', ' sat', ' on', ' the', ' mat', '.'], 'top_logprobs': None, 'text': 'The cat sat on the mat.'}, {'sent': 'A quick brown fox.', 'logprobs': [None, -8.135078, -8.417381, -0.029080056, -6.401503], 'tokens': ['A', ' quick', ' brown', ' fox', '.'], 'top_logprobs': None, 'text': 'A quick brown fox.'}, {'sent': 'GPT-2 is a language model.', 'logprobs': [None, -8.272442, -2.0609777, -0.8916882, -3.6185596, -1.4614378, -5.422453, -1.007168, -3.5893934], 'tokens': ['G', 'PT', '-', '2', ' is', ' a', ' language', ' model', '.'], 'top_logprobs': None, 'text': 'GPT-2 is a language model.'}]
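As with GPT-2, summing the returned token-level log probabilities scores each sentence; the first entry is None because the first token has no preceding context, so it is skipped. A minimal sketch over the examples list above:
for ex in examples:
    token_lps = [lp for lp in ex["logprobs"] if lp is not None]  # drop the leading None
    total = sum(token_lps)
    print(f"{ex['sent']}\ttotal log prob = {total:.3f}\tper-token = {total / len(token_lps):.3f}")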
The same applies to GPT-4o¶
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain how black holes form in simple terms."}
    ],
    temperature=0.7,
    max_tokens=50,
    top_p=1.0,
    frequency_penalty=0.0,
    presence_penalty=0.0
)
print(response.choices[0].message.content)
Black holes form from the remnants of massive stars. Here's a simple way to understand the process:

1. **Life of a Star**: Stars are massive celestial bodies primarily composed of hydrogen and helium. They shine by fusing hydrogen into helium in their
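The chat completions endpoint can also return log probabilities, but only for the tokens it generates, not for the prompt (there is no echo-style scoring of the input). A sketch using the logprobs and top_logprobs parameters of the chat API:
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Name one primary color."}],
    max_tokens=5,
    logprobs=True,      # return log probs for the generated tokens
    top_logprobs=3      # also return the 3 most likely alternatives at each position
)
for tok in response.choices[0].logprobs.content:
    print(tok.token, tok.logprob)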
Part III: Estimating Unigram Probability for SLOR¶
Load a fragment of the target dataset¶
Here, we use an open-source reconstruction of the OpenWebText corpus that was used to train the GPT-2 family; the current dataset is loaded from this Hugging Face dataset. You could look for alternative datasets depending on the model you are working with (which determines the relevant training corpus) and on the size you need.
from datasets import load_dataset
dataset_name = "stas/openwebtext-10k"
name = dataset_name.split('/')[-1]
ds = load_dataset(dataset_name, split='train')
ds.to_json(f"{name}.jsonl", orient="records", lines=True)
Found cached dataset openwebtext-10k (/Users/herbertzhou/.cache/huggingface/datasets/stas___openwebtext-10k/plain_text/1.0.0/3a8df094c671b4cb63ed0b41f40fb3bd855e9ce2e3765e5df50abcdfb5ec144b)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Creating json from Arrow format: 0%| | 0/10 [00:00<?, ?ba/s]
50978737
with open('openwebtext-10k.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]
print(data[0]['text'])
A magazine supplement with an image of Adolf Hitler and the title 'The Unreadable Book' is pictured in Berlin. No law bans “Mein Kampf” in Germany, but the government of Bavaria, holds the copyright and guards it ferociously. (Thomas Peter/REUTERS) The city that was the center of Adolf Hitler’s empire is littered with reminders of the Nazi past, from the bullet holes that pit the fronts of many buildings to the hulking Luftwaffe headquarters that now house the Finance Ministry. What it doesn’t have, nor has it since 1945, are copies of Hitler’s autobiography and political manifesto, “Mein Kampf,” in its bookstores. The latest attempt to publish excerpts fizzled this week after the Bavarian government challenged it in court, although an expurgated copy appeared at newspaper kiosks around the country. But in Germany — where keeping a tight lid on Hitler’s writings has become a rich tradition in itself — attitudes toward his book are slowly changing, and fewer people are objecting to its becoming more widely available. No law bans “Mein Kampf” in Germany, but the government of Bavaria, where Hitler officially resided at the time of his 1945 suicide, holds the copyright and guards it ferociously. German-language copies that were printed before 1945 are legal, although they command a premium price, and the book is available in translation elsewhere in the world. But the question of whether to publish it in the country where Hitler plotted his empire has lost some of its edge in the Google era, when a complete German-language copy of the book pops up as the second result on the local version of the search engine. “To say this is a very dangerous book, we must ban it, this is ridiculous,” said Wolfgang Wippermann, a professor of modern history at the Free University of Berlin. “Maybe it was necessary once, but now it’s over, it makes no sense. You can find it so easily.” The publisher of the excerpts, London-based Albertus, has said it will appeal the Bavarian government’s injunction. In 2009, the publisher beat charges of copyright violation and the illegal use of Nazi symbols after the Bavarian government seized reprinted copies of the Nazi Party’s in-house newspaper. The attempt to publish portions of “Mein Kampf” on Thursday was scuttled at the last moment, although the publisher, ready to capitalize on the publicity, had printed two versions of the pamphlet. The version propped on top of a heap of celebrity magazines at a newsstand in Berlin’s central Friedrichstrasse station was a slender, blue, 16-page leaflet that has historical commentary in one column and an image of blurred text stamped with “Unreadable” in the other, accompanied by two reproductions of Nazi-era newspapers. “Mein Kampf” “is an awful book, and the whole thinking is absolutely not ours, but we have another view on it regarding the idea of packing it away. This idea is just naive,” said Alexander Luckow, a spokesman for the publisher. “In a free country, you need to discuss these very bad parts of German history.” Still, he said, there are limits, and using Hitler’s words as inspiration, not as historical artifact, is where it crosses the line. “The danger is allowing right-wing people to sell it in bookshops with their modern commentary,” he said. “This is forbidden and it’s good . . . not only in Germany, this should be equal in other countries in Europe. Anti-Semitism is not confined to Germany. 
You look and it’s all around Europe, dating back to the Middle Ages.” The debate will soon be over, whether or not the latest excerpts make it to newsstands. German law extends copyright 70 years after the author’s death; after 2015, “Mein Kampf” will be fair game. Some in Bavaria’s government worry that neo-Nazis will publish their own version of the book shortly thereafter, and to counter that, they are encouraging a scholarly edition. A group of historians is preparing it. Germany’s Jewish organizations have approached the publication with mixed emotions, sensitive that their country still has problems with neo-Nazis and anti-Semitism. The German government released a study this week that found that one in five Germans has anti-Semitic attitudes. And a neo-Nazi ring that has been linked to at least nine killings before it was shut down in November shocked Germans who thought they had done a thorough job working through their past. “I do very well without any publishing of ‘Mein Kampf,’ ” said Dieter Graumann, the head of the Central Council of Jews in Germany. “In a few years, it will be free, and I have every trust in the democratic maturity of the German people. . . . But for the moment, I am glad it is not.”
vocab = {}
for line in tqdm(data[:300]):  # you may want to do extra processing here depending on the dataset
    for sen in line['text'].split('\n'):
        for token in tokenizer.tokenize(sen):
            vocab[token] = vocab.get(token, 0) + 1
100%|██████████| 300/300 [00:00<00:00, 344.25it/s]
vocab
{'A': 174, 'Ġmagazine': 18, 'Ġsupplement': 5, 'Ġwith': 1536, 'Ġan': 895, 'Ġimage': 29, 'Ġof': 6308, 'ĠAdolf': 2, 'ĠHitler': 8, 'Ġand': 5753, 'Ġthe': 11839, 'Ġtitle': 44, "Ġ'": 110, 'The': 773, 'ĠUn': 13, 'readable': 2, 'ĠBook': 8, "'": 179, 'Ġis': 2303, 'Ġpictured': 1, 'Ġin': 4271, 'ĠBerlin': 8, '.': 10805, 'ĠNo': 40, 'Ġlaw': 126, 'Ġbans': 4, 'ĠâĢ': 901, 'ľ': 1054, 'Me': 12, 'in': 152, 'ĠKamp': 7, 'f': 75, 'âĢ': 3641, 'Ŀ': 997, 'ĠGermany': 30, ',': 12059, 'Ġbut': 675, 'Ġgovernment': 248, 'ĠBav': 9, 'aria': 5, 'Ġholds': 14, 'Ġcopyright': 23, 'Ġguards': 10, 'Ġit': 1592, 'Ġfer': 4, 'oc': 27, 'iously': 7, 'Ġ(': 1171, 'Thomas': 3, 'ĠPeter': 10, '/': 310, 'RE': 5, 'UTERS': 2, ')': 669, 'Ġcity': 95, 'Ġthat': 2867, 'Ġwas': 1513, 'Ġcenter': 20, 'Ļ': 2338, 's': 1629, 'Ġempire': 4, 'Ġlittered': 2, 'Ġreminders': 1, 'ĠNazi': 11, 'Ġpast': 70, 'Ġfrom': 1002, 'Ġbullet': 7, 'Ġholes': 3, 'Ġpit': 14, 'Ġfronts': 3, 'Ġmany': 223, 'Ġbuildings': 28, 'Ġto': 6379, 'Ġh': 21, 'ul': 38, 'king': 12, 'ĠLu': 13, 'ft': 8, 'w': 35, 'affe': 3, 'Ġheadquarters': 11, 'Ġnow': 219, 'Ġhouse': 25, 'ĠFinance': 1, 'ĠMinistry': 12, 'What': 40, 'Ġdoesn': 91, 't': 504, 'Ġhave': 1100, 'Ġnor': 24, 'Ġhas': 863, 'Ġsince': 133, 'Ġ1945': 7, 'Ġare': 1085, 'Ġcopies': 8, 'Ġautobiography': 3, 'Ġpolitical': 98, 'Ġmanifesto': 2, 'Ġits': 417, 'Ġbook': 65, 'stores': 1, 'ĠThe': 873, 'Ġlatest': 40, 'Ġattempt': 26, 'Ġpublish': 6, 'Ġexcerpts': 4, 'Ġf': 75, 'izz': 3, 'led': 24, 'Ġthis': 877, 'Ġweek': 88, 'Ġafter': 275, 'arian': 17, 'Ġchallenged': 8, 'Ġcourt': 44, 'Ġalthough': 25, 'Ġexp': 1, 'urg': 2, 'ated': 39, 'Ġcopy': 17, 'Ġappeared': 39, 'Ġat': 957, 'Ġnewspaper': 17, 'Ġkios': 1, 'ks': 4, 'Ġaround': 158, 'Ġcountry': 96, 'But': 121, 'ĠâĢĶ': 260, 'Ġwhere': 203, 'Ġkeeping': 12, 'Ġa': 5109, 'Ġtight': 12, 'Ġlid': 4, 'Ġon': 1756, 'Ġwritings': 1, 'Ġbecome': 77, 'Ġrich': 13, 'Ġtradition': 12, 'Ġitself': 41, 'Ġattitudes': 6, 'Ġtoward': 27, 'Ġhis': 825, 'Ġslowly': 10, 'Ġchanging': 14, 'Ġfewer': 14, 'Ġpeople': 402, 'Ġobject': 6, 'ing': 176, 'Ġbecoming': 18, 'Ġmore': 627, 'Ġwidely': 17, 'Ġavailable': 51, 'No': 33, 'Ġofficially': 10, 'Ġresided': 1, 'Ġtime': 360, 'Ġsuicide': 9, 'ĠGerman': 18, '-': 2803, 'language': 3, 'Ġwere': 553, 'Ġprinted': 6, 'Ġbefore': 179, 'Ġlegal': 50, 'Ġthey': 748, 'Ġcommand': 10, 'Ġpremium': 2, 'Ġprice': 23, 'Ġtranslation': 2, 'Ġelsewhere': 11, 'Ġworld': 148, 'Ġquestion': 54, 'Ġwhether': 82, 'Ġplotted': 1, 'Ġlost': 45, 'Ġsome': 360, 'Ġedge': 12, 'ĠGoogle': 18, 'Ġera': 12, 'Ġwhen': 346, 'Ġcomplete': 18, 'Ġpops': 1, 'Ġup': 419, 'Ġas': 1388, 'Ġsecond': 89, 'Ġresult': 78, 'Ġlocal': 79, 'Ġversion': 36, 'Ġsearch': 21, 'Ġengine': 7, 'To': 31, 'Ġsay': 140, 'Ġvery': 145, 'Ġdangerous': 21, 'Ġwe': 577, 'Ġmust': 88, 'Ġban': 21, 'Ġridiculous': 7, 'Ġsaid': 855, 'ĠWolfgang': 3, 'ĠW': 43, 'ipp': 9, 'erman': 2, 'n': 73, 'Ġprofessor': 19, 'Ġmodern': 31, 'Ġhistory': 44, 'ĠFree': 10, 'ĠUniversity': 67, 'Maybe': 16, 'Ġnecessary': 21, 'Ġonce': 70, 'Ġover': 300, 'Ġmakes': 65, 'Ġno': 286, 'Ġsense': 48, 'ĠYou': 115, 'Ġcan': 577, 'Ġfind': 115, 'Ġso': 323, 'Ġeasily': 18, 'Ġpublisher': 8, 'ĠLondon': 67, 'based': 39, 'ĠAlbert': 6, 'us': 39, 'Ġwill': 641, 'Ġappeal': 11, 'Ġinjunction': 1, 'ĠIn': 246, 'Ġ2009': 24, 'Ġbeat': 23, 'Ġcharges': 16, 'Ġviolation': 8, 'Ġillegal': 25, 'Ġuse': 147, 'Ġsymbols': 1, 'Ġseized': 5, 'Ġreprinted': 1, 'ĠParty': 56, 'house': 7, 'Ġportions': 6, 'ĠThursday': 39, 'Ġsc': 15, 'utt': 2, 'Ġlast': 193, 'Ġmoment': 21, 'Ġready': 21, 'Ġcapitalize': 4, 'Ġpublicity': 4, 'Ġhad': 564, 'Ġtwo': 296, 'Ġversions': 21, 'Ġpamphlet': 1, 'Ġpro': 25, 'pped': 4, 
'Ġtop': 65, 'Ġheap': 7, 'Ġcelebrity': 3, 'Ġmagazines': 3, 'Ġnews': 79, 'stand': 3, 'Ġcentral': 14, 'ĠFriedrich': 1, 'str': 4, 'asse': 3, 'Ġstation': 21, 'Ġslender': 2, 'Ġblue': 19, 'Ġ16': 51, 'page': 5, 'Ġleaf': 5, 'let': 20, 'Ġhistorical': 5, 'Ġcommentary': 5, 'Ġone': 478, 'Ġcolumn': 5, 'Ġblurred': 1, 'Ġtext': 22, 'Ġstamped': 2, 'Un': 5, 'Ġother': 373, 'Ġaccompanied': 7, 'Ġby': 1045, 'Ġreprodu': 1, 'ctions': 1, 'era': 5, 'Ġnewspapers': 3, 'is': 82, 'Ġawful': 6, 'Ġwhole': 48, 'Ġthinking': 23, 'Ġabsolutely': 11, 'Ġnot': 968, 'Ġours': 5, 'Ġanother': 128, 'Ġview': 46, 'Ġregarding': 16, 'Ġidea': 45, 'Ġpacking': 4, 'Ġaway': 105, 'ĠThis': 166, 'Ġjust': 337, 'Ġnaive': 4, 'ĠAlexander': 4, 'ĠLuck': 2, 'ow': 27, 'Ġspokesman': 16, 'Ġfor': 2042, 'In': 188, 'Ġfree': 82, 'Ġyou': 940, 'Ġneed': 146, 'Ġdiscuss': 16, 'Ġthese': 219, 'Ġbad': 51, 'Ġparts': 24, 'Still': 8, 'Ġhe': 1064, 'Ġthere': 413, 'Ġlimits': 14, 'Ġusing': 96, 'Ġwords': 31, 'Ġinspiration': 9, 'Ġartifact': 1, 'Ġcrosses': 6, 'Ġline': 67, 'Ġdanger': 13, 'Ġallowing': 16, 'Ġright': 161, 'wing': 33, 'Ġsell': 28, 'sh': 55, 'ops': 10, 'Ġtheir': 646, 'This': 112, 'Ġforbidden': 4, 'Ġgood': 160, 'Ġ.': 204, 'Ġonly': 295, 'Ġshould': 201, 'Ġbe': 1188, 'Ġequal': 12, 'Ġcountries': 46, 'ĠEurope': 28, 'ĠAnti': 6, 'Semitism': 3, 'Ġconfined': 2, 'Ġlook': 113, 'Ġall': 498, 'Ġdating': 7, 'Ġback': 162, 'ĠMiddle': 7, 'ĠAges': 3, 'Ġdebate': 19, 'Ġsoon': 31, 'Ġor': 896, 'Ġmake': 233, 'Ġextends': 9, 'Ġ70': 11, 'Ġyears': 242, 'Ġauthor': 12, 'Ġdeath': 45, ';': 182, 'Ġ2015': 83, 'Ġfair': 12, 'Ġgame': 178, 'ĠSome': 43, 'Ġworry': 7, 'Ġneo': 6, 'Nazis': 4, 'Ġown': 171, 'Ġshortly': 7, 'Ġthereafter': 4, 'Ġcounter': 22, 'Ġencouraging': 7, 'Ġscholarly': 3, 'Ġedition': 6, 'ĠA': 233, 'Ġgroup': 87, 'Ġhistorians': 1, 'Ġpreparing': 9, 'Germany': 3, 'ĠJewish': 8, 'Ġorganizations': 7, 'Ġapproached': 6, 'Ġpublication': 12, 'Ġmixed': 11, 'Ġemotions': 8, 'Ġsensitive': 10, 'Ġstill': 172, 'Ġproblems': 40, 'Ġanti': 92, 'Ġreleased': 41, 'Ġstudy': 51, 'Ġfound': 110, 'Ġfive': 79, 'ĠGermans': 5, 'Semitic': 1, 'ĠAnd': 151, 'Nazi': 5, 'Ġring': 14, 'Ġbeen': 584, 'Ġlinked': 17, 'Ġleast': 68, 'Ġnine': 22, 'Ġkillings': 8, 'Ġshut': 15, 'Ġdown': 157, 'ĠNovember': 26, 'Ġshocked': 5, 'Ġwho': 582, 'Ġthought': 48, 'Ġdone': 65, 'Ġthorough': 6, 'Ġjob': 62, 'Ġworking': 56, 'Ġthrough': 146, 'I': 312, 'Ġdo': 289, 'Ġwell': 204, 'Ġwithout': 107, 'Ġany': 282, 'Ġpublishing': 8, 'ĺ': 151, 'ĠDiet': 2, 'er': 76, 'ĠGra': 1, 'umann': 1, 'Ġhead': 57, 'ĠCentral': 14, 'ĠCouncil': 28, 'ĠJews': 11, 'Ġfew': 101, 'ĠI': 1092, 'Ġevery': 114, 'Ġtrust': 25, 'Ġdemocratic': 10, 'Ġmaturity': 2, 'ĠBut': 204, 'Ġam': 65, 'Ġglad': 6, 'For': 57, 'Ġtoday': 70, 'Ġpost': 64, 'd': 124, 'Ġlike': 357, 'Ġtake': 173, 'ĠCalifornia': 20, 'Ġvoter': 7, 'Ġinitiative': 7, 'Ġlegalize': 10, 'Ġpot': 19, 'ĠIf': 109, 'Ġmeasure': 26, 'Ġpasses': 5, 'Ġsky': 7, 'Ġfall': 36, 'Ġstates': 49, 'Ġprobably': 54, 'Ġlooking': 54, 'Ġsimilar': 46, 'Ġchanges': 31, 'Ġnear': 45, 'Ġfuture': 61, 'ĠOur': 28, 'Ġdrug': 19, 'Ġpolicy': 55, 'Ġcentury': 11, 'Ġsimply': 44, 'Ġworked': 39, 'Ġheart': 27, 'ening': 9, 'Ġsee': 157, 'Ġstate': 138, 'Ġattempting': 10, 'Ġmarijuana': 32, 'Ġstatistics': 10, 'Ġarrests': 8, 'Ġreally': 136, 'Ġshocking': 5, 'ĠAccording': 19, 'ĠDrug': 5, 'ĠPolicy': 7, 'ĠAlliance': 4, 'Ġwhich': 540, 'Ġfavor': 17, 'Ġlegalization': 7, 'Ġblacks': 5, 'Ġarrested': 17, 'Ġpossession': 13, 'Ġbetween': 133, 'Ġfour': 97, 'Ġtwelve': 2, 'Ġtimes': 61, 'Ġthan': 365, 'Ġwhites': 11, 'Ġeven': 223, 'Ġthough': 69, 'Ġstudies': 16, 'Ġconsistently': 5, 'Ġshown': 31, 'Ġsmoke': 4, 'Ġten': 15, 
'Ġ500': 13, '000': 173, 'ĠThat': 105, 'Ġabsurd': 4, '!': 164, 'ĠThink': 4, 'Ġabout': 604, 'Ġhow': 255, 'Ġexpensive': 15, 'Ġcriminal': 28, 'Ġjustice': 29, 'Ġsystem': 91, 'Ġspends': 7, 'Ġ$': 215, '216': 1, 'Ġeach': 94, 'Ġjuvenile': 2, 'Ġinmate': 1, 'Ġprison': 26, 'Ġyet': 56, '8': 71, 'Ġper': 108, 'Ġstudent': 29, 'ĠOakland': 3, 'Ġschool': 70, 'ĠIt': 322, 'Ġseems': 37, 'Ġme': 222, 'Ġif': 386, 'Ġwant': 145, 'Ġlimit': 15, 'Ġspend': 25, 'Ġmoney': 95, 'Ġkids': 25, 'Ġhelping': 7, 'Ġthem': 337, 'Ġachieve': 12, 'Ġeconomic': 42, 'Ġbenefits': 29, 'Ġlegalizing': 1, 'Ġmind': 36, 'Ġblowing': 3, 'Ġlegalized': 5, 'Ġtaxed': 1, 'Ġsame': 137, 'Ġrate': 38, 'Ġtobacco': 1, 'Ġwould': 566, 'Ġsave': 24, 'Ġenforcement': 10, 'Ġgain': 9, 'Ġtax': 64, 'Ġrevenue': 13, 'Ġequals': 1, '17': 35, 'Ġbillion': 22, 'ĠAs': 78, 'ĠNicholas': 1, 'ĠKrist': 2, 'of': 35, 'Ġnotes': 16, 'Ġenough': 65, 'Ġsend': 29, 'Ġthree': 128, 'Ġyear': 214, 'Ġold': 57, 'Ġpoor': 13, 'Ġneighborhood': 10, 'Ġpre': 37, 'school': 6, 'ĠOr': 27, 'Ġcould': 271, 'Ġimproving': 3, 'Ġpublic': 168, 'Ġeducation': 19, 'Ġshore': 8, 'Ġborder': 19, 'Ġdefense': 25, 'ĠWhatever': 3, 'Ġexactly': 26, 'Ġtrivial': 6, 'Ġamount': 34, 'Ġbiggest': 25, 'Ġreason': 55, 'Ġhurt': 19, 'Ġcartels': 5, 'ĠImmigration': 1, 'Ġemerged': 8, 'Ġhot': 9, 'Ġbutton': 3, 'Ġissue': 78, 'Ġrecently': 38, 'ĠArizona': 14, 'Ġpassing': 16, 'Ġdraconian': 1, 'Ġimmigration': 13, 'Ġpropositions': 1, 'Ġbeing': 206, 'Ġconsidered': 28, 'ĠPeople': 16, 'Ġworried': 14, 'Ġviolence': 44, 'Ġunderstandably': 2, 'Ġwants': 31, 'Ġforeign': 42, 'Ġdealers': 1, 'Ġoperating': 24, 'Ġyard': 10, 'Ġmatter': 43, 'Ġlaws': 33, 'Ġpass': 24, 'Ġmuch': 196, 'ĠMexico': 4, 'ĠLatin': 6, 'ĠAmerican': 80, 'Ġalways': 92, 'Ġway': 192, 'Ġacross': 59, 'Ġimp': 4, 'orters': 2, 'Ġsmart': 14, 'Ġdemand': 16, 'Ġhigh': 100, 'Ġincreased': 32, 'Ġpatrols': 3, 'Ġagents': 16, 'Ġharsher': 1, 'Ġsentences': 6, 'Ġact': 35, 'Ġeffective': 19, 'Ġdeterrent': 2, 'ĠAmerica': 41, 'Ġmeans': 73, 'Ġlong': 130, 'Ġstays': 2, 'Ġviolent': 10, 'Ġoperate': 5, 'Ġour': 311, 'Ġborders': 8, 'Ġwhat': 396, 'Ġpushing': 15, 'Ġsuddenly': 13, '?': 331, 'Ġbuy': 28, 'Ġoff': 165, 'Ġstreet': 42, 'Ġinstead': 41, 'Ġwalk': 17, 'Ġinto': 394, 'Ġdispensary': 1, 'Ġquality': 27, 'Ġlegally': 10, 'Ġless': 108, 'Ġcharging': 3, 'ĠVery': 4, 'Ġactually': 78, 'Ġhide': 6, 'Ġgiven': 63, 'Ġchoice': 18, 'Ġsmokers': 1, 'Ġdrugs': 5, 'Ġseverely': 2, 'Ġweaken': 5, 'Ġdecrease': 12, 'Ġdeaths': 6, 'Ġrelated': 15, 'Ġtrafficking': 3, 'm': 149, 'Ġadvocating': 1, 'Ġhere': 141, 'Ġknow': 122, 'Ġruined': 2, 'Ġlives': 29, 'Ġexcess': 4, 'Ġtrue': 43, 'Ġgateway': 2, 'Ġdemon': 4, 'izing': 17, 'ĠJust': 13, 'Ġbecause': 263, 'Ġsomeone': 53, 'Ġsmokes': 1, 'Ġwhile': 155, 'Ġmean': 44, 'Ġperson': 59, 'Ġturn': 43, 'Ġheroin': 1, 'Ġaddict': 2, 'ĠYes': 22, 'Ġintox': 2, 'icates': 2, 'Ġalcohol': 16, 'Ġsensible': 4, 'Ġrestrictions': 6, 'Ġbuilt': 43, 'Ġsuch': 143, 'Ġmaking': 82, 'Ġdrive': 16, 'Ġunder': 154, 'Ġinfluence': 25, 'Ġthen': 212, 'An': 25, 'arch': 7, 'ists': 27, 'Ġsolidarity': 5, 'Ġpur': 7, 'ged': 8, 'Ġimmigrants': 18, 'ĠAg': 3, 'ios': 5, 'ĠPant': 2, 'ele': 2, 'imon': 4, 'as': 99, 'Ġventured': 3, 'Ġagain': 79, 'Ġopen': 56, 'Ġplayground': 5, 'Ġkept': 23, 'Ġlocked': 6, 'Ġfascists': 8, 'Ġsegregation': 4, 'Ġleading': 26, 'Ġbattle': 26, 'Ġriot': 4, 'Ġpolice': 166, 'On': 45, 'ĠTuesday': 33, 'Ġ9': 52, '06': 6, 'Ġanarchists': 3, 'Ġdaily': 19, 'Ġterror': 15, 'ised': 21, 'Ġfascist': 4, 'Ġthugs': 3, 'ĠGolden': 9, 'ĠDawn': 2, 'Ġneon': 1, 'azi': 1, 'Ġparty': 90, 'Ġallies': 10, 'Ġarea': 50, 'Ġmoved': 24, 'Ġun': 37, 'block': 4, 'Ġentrance': 3, 
'Ġchildren': 76, 'Ġkeep': 67, 'Ġeffort': 25, 'Ġimpose': 4, 'Ġg': 23, 'ree': 2, 'Ġ"': 724, 'to': 54, 'Ġpreserve': 5, 'Ġblood': 24, 'Ġpurity': 1, 'Ġwhite': 56, 'Ġrace': 40, '"...': 2, 'While': 27, 'blocking': 1, 'Ġattacked': 14, 'Ġrouted': 1, 'Ġarrival': 4, 'Ġforces': 24, 'Ġengaged': 10, 'Ġaim': 14, 'Ġprotecting': 8, 'ĠDuring': 12, 'Ġclashes': 7, 'Ġpoliceman': 1, 'Ġinjured': 11, 'Ġprotesters': 38, 'ĠAfter': 51, 'Ġend': 114, 'reek': 3, 'Ġfather': 28, 'ĠMr': 88, 'ĠTas': 1, 'oul': 7, 'Ġdef': 4, 'ying': 7, 'Ġreign': 6, 'Ġtook': 78, 'Ġson': 30, 'Ġplay': 108, 'Ġcoveted': 2, 'ĠSoon': 6, 'Ġsurrounded': 2, 'Ġblocked': 3, 'Ġexit': 3, 'Ġthreatened': 16, 'Ġl': 30, 'inch': 9, 'Ġcalling': 21, 'Ġhim': 257, 'Ġtraitor': 2, 'Ġmanaged': 20, 'Ġhandle': 9, 'Ġchild': 33, 'Ġsympathetic': 2, 'Ġneighbor': 6, 'Ġfull': 66, 'Ġpresence': 24, 'Ġchief': 28, 'Ġstrong': 33, 'Ġpresent': 35, 'Ġscene': 17, 'Ġsolicitor': 2, 'Ġfigure': 23, 'Ġhuman': 48, 'Ġrights': 54, 'Ġactivist': 7, 'Ġpiled': 4, 'Ġeggs': 6, 'Ġher': 318, 'Ġlife': 92, 'Ġnew': 263, 'Ġtension': 5, 'Ġcomes': 41, 'Ġeuro': 11, 'election': 6, 'Ġascent': 3, 'ĠLA': 7, 'OS': 9, 'ĠPopular': 2, 'ĠOrthodox': 3, 'ĠAl': 41, 'arm': 5, 'Ġ4': 95, 'th': 112, 'Ġposition': 43, 'Ġ7': 107, '%': 106, 'Ġvote': 38, 'Ġcombination': 12, 'Ġgoverning': 4, "'s": 1201, 'Ġlandslide': 4, 'Ġdefeat': 9, 'Ġled': 48, 'Ġendorse': 3, 'Ġcore': 19, 'Ġextreme': 14, 'right': 9, 'Ġwing': 9, 'Ġpolicies': 25, 'Ġpledge': 5, 'Ġmass': 14, 'Ġsweeping': 3, 'Ġoperation': 11, 'Ġagainst': 156, 'Ġsupporters': 18, 'Ġwithin': 103, 'Ġsummer': 39, 'Ġconcentration': 6, 'Ġcamp': 32, 'Ġplanned': 14, 'ĠNATO': 4, 'Ġair': 34, 'base': 2, 'prop': 1, 'yr': 5, 'g': 45, 'os': 47, 'Ġdeemed': 6, 'Ġimpractical': 3, 'Ġcommitted': 20, 'Ġseveral': 104, 'Ġmilitary': 58, 'Ġcamps': 9, 'Ġdisgrace': 3, 'ful': 17, 'Ġhumanitarian': 3, 'Ġstandards': 13, 'Ġcapital': 28, 'Ġpurpose': 19, 'cle': 1, 'aning': 5, 'Ġforeigners': 6, '".': 45, 'Ġmeasures': 24, 'Ġdiscourse': 4, 'Ġlittle': 99, 'Ġsurprise': 11, 'Ġfamous': 10, 'Ġwanting': 7, 'Ġdis': 19, 'place': 5, 'Ġhomosexuals': 1, 'Ġdesert': 1, 'Ġislands': 3, 'Ġlate': 49, 'Ġ1970': 14, 'ĠFurthermore': 5, 'Law': 7, 'ĠOrder': 7, '"': 518, 'Ġcoming': 39, 'Ġalso': 439, 'Ġinclude': 40, 'Ġattack': 54, 'Ġanarchist': 1, 'Ġsquats': 2, 'Ġactions': 16, 'Ġmovement': 48, 'Ġhopes': 10, 'Ġvia': 57, 'Ġvirtual': 23, 'Ġoccupation': 17, 'ĠAthens': 4, 'Ġmonths': 73, 'Ġmodeled': 1, 'ĠOlympics': 3, 'Ġ2004': 15, 'Ġintroduction': 6, 'Ġdisputed': 7, 'Ġlegislation': 27, 'Ġeventually': 23, 'Ġrender': 3, 'Ġprotest': 39, 'Ġmarches': 2, 'ĠDue': 4, 'Ġlack': 33, 'Ġlegislative': 10, 'Ġmajority': 39, 'ĠMP': 16, 'Ġresorted': 1, 'Ġtrick': 2, 'Ġincreasing': 18, 'Ġtotal': 39, 'Ġnumber': 91, 'ĠMPs': 8, 'Ġnon': 58, 'elected': 3, 'Ġmember': 31, 'Ġliking': 4, 'Ġsession': 18, 'ĠParliament': 23, 'Ġdict': 2, 'atorial': 2, 'Ġrule': 49, 'Ġruthless': 3, 'Ġemployment': 6, 'Ġpar': 10, 'ast': 13, 'ate': 78, 'ĠLast': 15, 'Ġoffice': 50, 'Ġbombed': 1, 'ĠMarxist': 1, 'Ġguerrilla': 1, 'Ġseries': 51, 'Ġluxury': 4, 'Ġbro': 5, 'the': 101, 'ls': 4, 'Ġfrequ': 2, 'ented': 2, 'Ġruling': 8, 'Ġclass': 38, 'Ġdestroyed': 8, 'ĠAt': 48, 'Ġguard': 21, 'Ġexpectation': 3, 'Ġnext': 99, 'ĠSaturday': 18, 'ĠGay': 2, 'ĠPride': 6, 'Ġparade': 2, 'Ġevacuation': 1, 'Ġcourts': 6, 'town': 4, 'Ġoccupied': 14, 'Ġconstant': 12, 'Ġtarget': 29, 'Ġbourgeois': 1, 'Ġmedia': 72, 'Ġwaste': 6, 'Ġsupporting': 14, 'Ġmost': 247, 'Ġunamb': 1, 'iguous': 2, 'Ġmanner': 10, 'Ġ45': 17, 'year': 98, 'old': 72, 'high': 4, 'way': 22, 'Ġshooter': 16, 'Ġ12': 85, 'minute': 7, 'Ġshootout': 2, ...}
total_tokens = torch.tensor(list(vocab.values())).sum()
unigram_prob = vocab['Ġthe'] / total_tokens
unigram_prob
tensor(0.0375)
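With unigram counts in hand, the pieces can be combined into SLOR: the model log probability from Part I minus the sentence's unigram log probability, divided by the sentence length. The sketch below uses add-one smoothing for tokens that never appeared in the 300-document sample (a simple choice, not the only one); as in Part I, the first token is left unscored on both sides.
import math

total = float(total_tokens)
vocab_size = len(vocab)

def unigram_log_prob(token):
    # add-one smoothing so unseen tokens do not get zero probability
    return math.log((vocab.get(token, 0) + 1) / (total + vocab_size))

def slor(sentence):
    enc = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)
    input_ids = enc["input_ids"].to(model.device)
    with torch.no_grad():
        out = model(input_ids)
    log_p = F.log_softmax(out.logits, dim=-1)
    # model log prob of tokens 2..n, using the same gather-based scoring as Part I
    tok_lp = torch.gather(log_p[:, :-1, :], 2, input_ids[:, 1:].unsqueeze(-1)).squeeze(-1)
    model_lp = tok_lp.sum().item()
    # unigram log prob of the same tokens
    toks = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])[1:]
    unigram_lp = sum(unigram_log_prob(t) for t in toks)
    return (model_lp - unigram_lp) / len(toks)

print(slor("Every morning, I drink a big glass of water."))
print(slor("Every morning, I drink a big glass of chair."))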