Basic Algorithm

💡 Observe how the red and green lists are generated for the last prompt token, 'brown'.

import numpy as np
import hashlib
import random
# Define the model's vocabulary and the initial prompt
vocabulary = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', 'runs', 'and']
prompt = 'The quick brown'

def language_model(vocabulary):
	# Stand-in for a real model: assign random logits to every vocabulary word
	# (a real language model would condition these logits on the prompt)
	logits = np.random.randn(len(vocabulary))
	# Convert logits to probabilities with a numerically stable softmax
	exp_logits = np.exp(logits - np.max(logits))
	probabilities = exp_logits / np.sum(exp_logits)
	return dict(zip(vocabulary, probabilities))

def partition_vocabulary(vocabulary, hash_value):
	# Seed the RNG with the hash so the same preceding token
	# always produces the same green/red split
	random.seed(hash_value)
	shuffled_vocabulary = vocabulary[:]
	random.shuffle(shuffled_vocabulary)
	half_size = len(shuffled_vocabulary) // 2
	green_list = shuffled_vocabulary[:half_size]
	red_list = shuffled_vocabulary[half_size:]
	return green_list, red_list

def simulate_watermarking(vocabulary, prompt):
	# The watermark is keyed on the last token of the context
	token = prompt.split()[-1]
	# Use the language model to get the probability vector
	prob_vector = language_model(vocabulary)
	# Hash the preceding token; the digest will seed the vocabulary partition
	hash_value = hashlib.sha256(token.encode()).hexdigest()
	# Partition the vocabulary into red and green lists
	green_list, red_list = partition_vocabulary(vocabulary, hash_value)
	# Greedily pick the highest-probability token from the green list;
	# in this hard scheme, red-list tokens are never emitted
	next_token = max(green_list, key=lambda word: prob_vector[word])
	print("Probability vector:", prob_vector)
	print("Hash value:", hash_value)
	print("Red list:", red_list)
	print("Green list:", green_list)
	print("Next token:", next_token)
	print("The generated sequence: ", prompt+' '+ next_token)
simulate_watermarking(vocabulary, prompt)
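
To see the watermark act over a whole sequence rather than a single step, the one-step simulation can be run in a loop: hash the most recent token, re-partition the vocabulary, and pick the next token from the fresh green list. The sketch below is a minimal extension of the functions defined above; the helper name generate_watermarked and the step count of five are illustrative choices, not part of the algorithm itself.

def generate_watermarked(vocabulary, prompt, num_tokens=5):
	# Repeat the single-step simulation: every new token is drawn from a
	# green list derived from the hash of the token immediately before it
	tokens = prompt.split()
	for _ in range(num_tokens):
		prob_vector = language_model(vocabulary)
		hash_value = hashlib.sha256(tokens[-1].encode()).hexdigest()
		green_list, _ = partition_vocabulary(vocabulary, hash_value)
		tokens.append(max(green_list, key=lambda word: prob_vector[word]))
	return ' '.join(tokens)

print(generate_watermarked(vocabulary, prompt))

This same hashing scheme is what makes the watermark detectable. A verifier who knows the hash function can re-derive each position's green list and count how many tokens landed in it: unwatermarked text lands in the green list about half the time, so a high green fraction is strong evidence of the watermark. Below is a minimal detection sketch under that assumption; the z-score is the standard one-proportion test, and the function name detect_watermark is hypothetical.

import math

def detect_watermark(text, vocabulary):
	# Score each token against the green list derived from its predecessor
	# (prompt tokens are scored too, which slightly dilutes the statistic)
	tokens = text.split()
	green_hits = 0
	for prev_token, token in zip(tokens, tokens[1:]):
		hash_value = hashlib.sha256(prev_token.encode()).hexdigest()
		green_list, _ = partition_vocabulary(vocabulary, hash_value)
		if token in green_list:
			green_hits += 1
	scored = len(tokens) - 1
	# Under the no-watermark null hypothesis, each token is green with p = 0.5
	z_score = (green_hits - 0.5 * scored) / math.sqrt(0.25 * scored)
	return green_hits, scored, z_score

print(detect_watermark(generate_watermarked(vocabulary, prompt), vocabulary))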