Entropy and Language Modeling (homework 1)
Solution to the first homework for the NPFL067 course.
source_files = [
"TEXTCZ1.txt",
"TEXTEN1.txt"
]
## load the text files
def load_text(filename) -> str:
with open(filename, "r", encoding="utf-8") as f:
return f.read()
texts = {file : load_text(file).split('\n') for file in source_files}
for text, words in texts.items():
print(f"Text: {text}, word-count: {len(words)}, unique-words: {len(set(words))}")
lexicons = {file : set(texts[file]) for file in texts}
alphabets = {file : set("".join(texts[file])) for file in texts}
Text: TEXTCZ1.txt, word-count: 222413, unique-words: 42827
Text: TEXTEN1.txt, word-count: 221099, unique-words: 9608
Entropy of a text
Task: For each text, determine the conditional entropy of the word distribution given the previous word. We need to count two things:
- P(i) : the probability of word i occurring in the text,
- P(i,j) : the joint probability of word j immediately following word i in the text.
From these two we can also compute the conditional probability P(j|i), since P(j|i) = P(i,j) / P(i).
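For reference, the conditional entropy of the word distribution given the previous word, and the corresponding perplexity, are the standard quantities

$$H(J \mid I) = -\sum_{i,j} P(i,j)\,\log_2 P(j \mid i), \qquad PX(J \mid I) = 2^{H(J \mid I)},$$

which is exactly what the code below computes from the estimated P(i,j) and P(j|i).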
from collections import defaultdict
from typing import List, Dict, Tuple
def get_single_word_prob(text : List[str]) -> Dict[str, float]:
counts = defaultdict(int)
total_words = 0
    for word in text:
        ## defaultdict(int) already initializes unseen words to 0
        counts[word] += 1
        total_words += 1
return {word : count / total_words for word, count in counts.items()}
def get_pair_counts(text : List[str]) -> Tuple[List[Tuple[Tuple[str, str], int]], int]:
counts = defaultdict(int)
total_pairs = 0
for i, j in zip(text, text[1:]):
counts[(i,j)] += 1
total_pairs += 1
return ([(pair, count) for pair, count in counts.items()], total_pairs)
def get_pair_joint_prob(text : List[str]) -> Dict[Tuple[str, str], float]:
counts = defaultdict(int)
total_pairs = 0
    for i, j in zip(text, text[1:]):
        ## defaultdict(int) already initializes unseen pairs to 0
        counts[(i,j)] += 1
        total_pairs += 1
return {pair : count / total_pairs for pair, count in counts.items()}
def get_conditional_prob(joint_prob : Dict[Tuple[str, str], float], word_prob : Dict[str, float]):
return {pair : prob / word_prob[pair[0]] for pair, prob in joint_prob.items()}
Now we have all the data necessary to compute the conditional entropy (and perplexity) for both texts...
from math import log2
def get_entropy(word_probs : Dict[str, float]) -> float:
return -sum([word_probs[word] * log2(word_probs[word]) for word in word_probs.keys()])
def get_perplexity(word_probs : Dict[str, float]) -> float:
return 2**get_entropy(word_probs)
def get_conditional_entropy(joint_probs : Dict[Tuple[str, str], float], cond_probs : Dict[Tuple[str, str], float]) -> float:
return -sum([joint_probs[pair] * log2(cond_probs[pair]) for pair in joint_probs.keys()])
def get_conditional_perplexity(joint_probs : Dict[Tuple[str, str], float], cond_probs : Dict[Tuple[str, str], float]) -> float:
return 2**get_conditional_entropy(joint_probs, cond_probs)
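As a quick sanity check on hypothetical toy distributions (a fair coin and a single deterministic word pair, not taken from the assignment data), the helpers above should behave as expected:
## toy distributions only, for checking the helper functions
## a fair coin: entropy 1 bit, perplexity 2
assert abs(get_entropy({"heads": 0.5, "tails": 0.5}) - 1.0) < 1e-9
assert abs(get_perplexity({"heads": 0.5, "tails": 0.5}) - 2.0) < 1e-9
## a deterministic continuation: conditional entropy 0, perplexity 1
assert get_conditional_entropy({("a", "b"): 1.0}, {("a", "b"): 1.0}) == 0.0
assert get_conditional_perplexity({("a", "b"): 1.0}, {("a", "b"): 1.0}) == 1.0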
conditional_word_probs = {file : get_single_word_prob(texts[file]) for file in texts}
joint_probs = {file : get_pair_joint_prob(texts[file]) for file in texts}
cond_probs = {file : get_conditional_prob(joint_probs[file], conditional_word_probs[file]) for file in texts}
## count word entropy for the whole text
word_entropy = {text : get_entropy(conditional_word_probs[text]) for text in texts}
## count the conditional entropy for the whole text
cond_entropy = {text : get_conditional_entropy(joint_probs[text], cond_probs[text]) for text in texts}
cond_perplexity = {text : get_conditional_perplexity(joint_probs[text], cond_probs[text]) for text in texts}
for text in texts:
print(f"Text {text} has word entropy {format(word_entropy[text], ".3f")} and perplexity {format(get_perplexity(conditional_word_probs[text]), ".3f")}")
print()
for text in texts:
print(f"Text {text} has conditional entropy {format(cond_entropy[text], ".3f")} and perplexity {format(cond_perplexity[text], ".3f")}")
Text TEXTCZ1.txt has word entropy 11.538 and perplexity 2974.496
Text TEXTEN1.txt has word entropy 9.037 and perplexity 525.469

Text TEXTCZ1.txt has conditional entropy 4.748 and perplexity 26.868
Text TEXTEN1.txt has conditional entropy 5.287 and perplexity 39.056
Interpretation
Since the Czech text contains significantly more unique words compared to the English text, we expected the word entropy to be higher for the Czech text than for the English text. This expectation was confirmed by our experiment.
On the other hand, the conditional entropy shows the opposite trend, with English having higher entropy than Czech. This can be attributed to the English text containing roughly four times fewer unique words at about the same length, which means more distinct word pairs share the same first word. When predicting the following word with the previous word fixed, English therefore offers more plausible continuations, i.e., greater uncertainty about the next word.
To support this claim, we grouped all word pairs by their first word and, for the top 100 words, examined how often each word appears as the first word of a pair. We expect that the text with more unique words (Czech) would have a lower count of pairs with a common first word, leading to a higher probability of correctly predicting the following word (lower entropy).
import matplotlib.pyplot as plt
from collections import defaultdict
def plot_top_word_pairs(texts, top_n : int, title : str):
fig, ax = plt.subplots(figsize=(10, 6))
for text in texts:
unique_pairs, pairs_count = get_pair_counts(texts[text])
first_word_pairs = defaultdict(int)
for pair, count in unique_pairs:
first_word_pairs[pair[0]] += count
top_counts = sorted(first_word_pairs.values(), reverse=True)[:top_n]
# Plot the rank (1st, 2nd, etc.) vs counts
ranks = range(1, len(top_counts) + 1)
ax.plot(ranks, top_counts, label=f"Text {text}")
# Customize the plot
ax.set_title(title)
ax.set_xlabel('Rank (1st, 2nd, ...)')
ax.set_ylabel('Counts')
ax.legend()
ax.grid(True)
# Show plot
plt.tight_layout()
plt.show()
# Example usage:
plot_top_word_pairs(texts, 100, 'Top 100 Word Pair Counts by Rank for Each Text')
Mess it up!
In this section, we will manipulate the text to observe how it affects entropy and perplexity. We will run each experiment REPETITIONS times and report the minimum, maximum, and average scores for each (text, probability) experiment pair.
import random
from typing import List
def mess_up_chars(text : List[str], alphabet : str, prob : float) -> List[str]:
new_text = []
for word in text:
new_word = []
for char in word:
if random.random() <= prob:
new_word.append(random.choice(alphabet))
else:
new_word.append(char)
new_text.append("".join(new_word))
return new_text
def mess_up_words(text : List[str], lexicon : List[str], prob : float) -> List[str]:
new_text = []
for word in text:
if random.random() <= prob:
new_text.append(random.choice(lexicon))
else:
new_text.append(word)
return new_text
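As a quick, hypothetical usage example (toy words and a toy alphabet/lexicon rather than the corpora), with a fixed seed so the result is reproducible:
## hypothetical toy input, just to illustrate the two messing functions
random.seed(0)
toy_words = ["the", "cat", "sat"]
print(mess_up_chars(toy_words, list("abcdefghijklmnopqrstuvwxyz"), 0.5))  ## roughly half the characters replaced
print(mess_up_words(toy_words, ["dog", "mat", "on"], 0.5))                ## roughly half the words replaced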
Hypothesis
We expect that both the entropy and the perplexity will increase as the probability of altering a character or word increases. Since entropy measures the uncertainty of a random variable, randomly altering words or characters should move the distribution away from that of the original text, introducing more uncertainty into the prediction of the next word.
import random
import pickle
REPETITIONS = 10
results = {}
probs = [.2, .1, .05, .01, .001, .0001, .00001]
for text in texts:
results_single_text = {}
for prob in probs:
results_single_prob = {}
for type in ["char", "word"]:
random.seed(42)
results_single_type = []
for i in range(REPETITIONS):
if (type == "char"):
new_text = mess_up_chars(texts[text], list(alphabets[text]), prob)
else:
new_text = mess_up_words(texts[text], list(lexicons[text]), prob)
                ## evaluate a single experiment
new_word_probs = get_single_word_prob(new_text)
new_joint_probs = get_pair_joint_prob(new_text)
new_cond_probs = get_conditional_prob(new_joint_probs, new_word_probs)
new_cond_entropy = get_conditional_entropy(new_joint_probs, new_cond_probs)
new_cond_perplexity = get_conditional_perplexity(new_joint_probs, new_cond_probs)
results_single_type.append({"entropy" : new_cond_entropy, "perplexity" : new_cond_perplexity})
            ## aggregate the results for one messing type
min_entropy = min([result["entropy"] for result in results_single_type])
max_entropy = max([result["entropy"] for result in results_single_type])
avg_entropy = sum([result["entropy"] for result in results_single_type]) / REPETITIONS
min_perplexity = min([result["perplexity"] for result in results_single_type])
max_perplexity = max([result["perplexity"] for result in results_single_type])
avg_perplexity = sum([result["perplexity"] for result in results_single_type]) / REPETITIONS
results_single_prob[type] = {"entropy" : {"min" : min_entropy, "max" : max_entropy, "avg" : avg_entropy}, "perplexity" : {"min" : min_perplexity, "max" : max_perplexity, "avg" : avg_perplexity}}
## aggregate one probability results
results_single_text[format(prob, ".5f")] = results_single_prob
## aggregate one text results
results[text] = results_single_text
with open("results.pkl", "wb") as f:
pickle.dump(results, f)
Now let's compare the final results:
import pickle
from utils import plot_results, create_tables
import pandas as pd
with open("results.pkl", "rb") as f:
results = pickle.load(f)
# Call the function to plot
plot_results(results)
for table_name, table in create_tables(results).items():
print(f"{table_name}")
display(table)
Source: TEXTCZ1.txt, Messing: word
| Prob. | Min entropy | Max entropy | Avg entropy | Min perplexity | Max perplexity | Avg perplexity |
|---|---|---|---|---|---|---|
| 0.20000 | 4.47 | 4.49 | 4.48 | 22.14 | 22.41 | 22.27 |
| 0.10000 | 4.63 | 4.64 | 4.64 | 24.81 | 25.01 | 24.91 |
| 0.05000 | 4.70 | 4.70 | 4.70 | 25.96 | 26.05 | 25.99 |
| 0.01000 | 4.74 | 4.74 | 4.74 | 26.69 | 26.73 | 26.70 |
| 0.00100 | 4.75 | 4.75 | 4.75 | 26.85 | 26.87 | 26.85 |
| 0.00010 | 4.75 | 4.75 | 4.75 | 26.86 | 26.87 | 26.87 |
| 0.00001 | 4.75 | 4.75 | 4.75 | 26.87 | 26.87 | 26.87 |
Source: TEXTCZ1.txt, Messing: char
| Prob. | Min entropy | Max entropy | Avg entropy | Min perplexity | Max perplexity | Avg perplexity |
|---|---|---|---|---|---|---|
| 0.20000 | 3.53 | 3.55 | 3.54 | 11.58 | 11.71 | 11.62 |
| 0.10000 | 4.00 | 4.01 | 4.01 | 16.02 | 16.12 | 16.08 |
| 0.05000 | 4.33 | 4.34 | 4.34 | 20.15 | 20.29 | 20.21 |
| 0.01000 | 4.66 | 4.66 | 4.66 | 25.20 | 25.32 | 25.25 |
| 0.00100 | 4.74 | 4.74 | 4.74 | 26.69 | 26.70 | 26.69 |
| 0.00010 | 4.75 | 4.75 | 4.75 | 26.85 | 26.86 | 26.85 |
| 0.00001 | 4.75 | 4.75 | 4.75 | 26.87 | 26.87 | 26.87 |
Source: TEXTEN1.txt, Messing: word
| Prob. | Min entropy | Max entropy | Avg entropy | Min perplexity | Max perplexity | Avg perplexity |
|---|---|---|---|---|---|---|
| 0.20000 | 5.56 | 5.57 | 5.56 | 47.20 | 47.51 | 47.34 |
| 0.10000 | 5.46 | 5.46 | 5.46 | 43.91 | 44.12 | 43.99 |
| 0.05000 | 5.38 | 5.39 | 5.38 | 41.56 | 41.86 | 41.68 |
| 0.01000 | 5.30 | 5.31 | 5.31 | 39.53 | 39.63 | 39.59 |
| 0.00100 | 5.29 | 5.29 | 5.29 | 39.10 | 39.12 | 39.11 |
| 0.00010 | 5.29 | 5.29 | 5.29 | 39.06 | 39.07 | 39.06 |
| 0.00001 | 5.29 | 5.29 | 5.29 | 39.06 | 39.06 | 39.06 |
Source: TEXTEN1.txt, Messing: char
| Prob. | Min entropy | Max entropy | Avg entropy | Min perplexity | Max perplexity | Avg perplexity |
|---|---|---|---|---|---|---|
| 0.20000 | 4.02 | 4.04 | 4.03 | 16.23 | 16.45 | 16.32 |
| 0.10000 | 4.72 | 4.74 | 4.73 | 26.44 | 26.69 | 26.56 |
| 0.05000 | 5.05 | 5.06 | 5.06 | 33.18 | 33.39 | 33.30 |
| 0.01000 | 5.25 | 5.25 | 5.25 | 37.96 | 38.14 | 38.05 |
| 0.00100 | 5.28 | 5.28 | 5.28 | 38.93 | 38.98 | 38.96 |
| 0.00010 | 5.29 | 5.29 | 5.29 | 39.04 | 39.05 | 39.05 |
| 0.00001 | 5.29 | 5.29 | 5.29 | 39.05 | 39.06 | 39.06 |
Interpretation
The hypothesis holds true only for the manipulation of words in English. For the other experiments, we provide the following explanation:
Why does messing up characters not work?
In our initial hypothesis, we expected that manipulating characters would also lead to more options when predicting the next word. However, we overlooked the fact that altering single characters mostly creates entirely new words that are unseen elsewhere in the dataset. This in turn produces a much higher number of unique pairs that occur only once in the text. For such a single-occurrence pair (x, y), the conditional probability P(y|x) (the probability that y follows x) is equal to 1, so the pair contributes 0 to the conditional entropy. We illustrate this below by comparing the number of new word pairs with the number of changed words in the manipulated text: the closer the number of new pairs is to the number of changed words, the more obvious the prediction becomes.
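To make the mechanism concrete, here is a minimal self-contained sketch using made-up tokens (not drawn from the corpora):
from collections import Counter
from math import log2
toy = "a qz7 b a c b a c".split()           ## hypothetical toy text; the made-up token "qz7" occurs exactly once
pair_counts = Counter(zip(toy, toy[1:]))
first_counts = Counter(w for w, _ in zip(toy, toy[1:]))
## "qz7" starts exactly one pair, so its only continuation is certain ...
p_cond = pair_counts[("qz7", "b")] / first_counts["qz7"]
print(p_cond)        ## 1.0
## ... and the pair's term in the conditional entropy, -P(qz7, b) * log2(P(b | qz7)), is 0 bits
print(log2(p_cond))  ## 0.0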
messed_texts = {
text:
{
"char" : mess_up_chars(texts[text], list(alphabets[text]), 0.2),
"word" : mess_up_words(texts[text], list(lexicons[text]), 0.2)
}
for text in texts
}
messed_texts_char = {text : messed_texts[text]["char"] for text in texts}
messed_texts_word = {text : messed_texts[text]["word"] for text in texts}
for text in texts:
print(f"Evaluating {text}...")
for messing_type in ["char", "word"]:
messed_text = messed_texts[text][messing_type]
num_of_unique_words = len(lexicons[text])
num_of_unique_words_messed = len(set(messed_text))
diff_words = num_of_unique_words_messed - num_of_unique_words
prct_words = num_of_unique_words_messed / num_of_unique_words * 100 if diff_words > 0 else (1 - num_of_unique_words_messed / num_of_unique_words) * 100
pair_counts = get_pair_counts(texts[text])
pair_counts_messed = get_pair_counts(messed_text)
## calculate the number of pairs that start with the same word
same_first_word_counter = defaultdict(int)
for pair, count in pair_counts[0]:
same_first_word_counter[pair[0]] += 1
same_first_word_counter_messed = defaultdict(int)
for pair, count in pair_counts_messed[0]:
same_first_word_counter_messed[pair[0]] += 1
single_pair_count = len([count for count in same_first_word_counter.values() if count == 1])
single_pair_count_messed = len([count for count in same_first_word_counter_messed.values() if count == 1])
diff_pairs = single_pair_count_messed - single_pair_count
prct_pairs = single_pair_count_messed / single_pair_count * 100 if (diff_pairs > 0) else (1 - single_pair_count_messed / single_pair_count) * 100
print(f" Messed with: {messing_type}:")
print(f" {"+" if diff_words > 0 else ""}{diff_words} words ({"+" if diff_words > 0 else "-"}{prct_words:.2f}%)")
print(f" {"+" if diff_pairs > 0 else ""}{diff_pairs} pairs ({"+" if diff_pairs > 0 else "-"}{prct_pairs:.2f}%)")
Evaluating TEXTCZ1.txt...
  Messed with: char:
    +76477 words (+278.57%)
    +84003 pairs (+398.72%)
  Messed with: word:
    -1939 words (-4.53%)
    -17350 pairs (-61.70%)
Evaluating TEXTEN1.txt...
  Messed with: char:
    +85030 words (+984.99%)
    +80396 pairs (+1959.73%)
  Messed with: word:
    -11 words (-0.11%)
    -4239 pairs (-98.06%)
Interpretation
We observe that for both languages, altering characters generates roughly as many new single-occurrence pairs as it does new words, and in both cases the number of single-occurrence pairs grows rapidly. Manipulating characters therefore drives the conditional entropy down, because all of these single-occurrence pairs contribute 0 to it.
In the case of word manipulation, we see that in English we eliminate nearly all single-occurrence pairs while keeping essentially the same lexicon. This must increase the entropy, since P(y|x) is no longer equal to 1 for many pairs. We cannot make the same claim for Czech, where a much smaller share of the single-occurrence pairs was eliminated.
Cross Entropy and Language Modeling
Now, we will focus on language modeling based on n-gram predictions. We will split the dataset into train, heldout, and test sets. We will count conditional probabilities of each n-gram and use the EM algorithm to obtain the best mix of these n-gram predictions for a language model capable of predicting the next word.
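Concretely, the smoothed model we build below is a linear interpolation of four estimators, where V is the training vocabulary and the weights sum to one:

$$P_\lambda(w_i \mid w_{i-2}, w_{i-1}) = \lambda_0 \frac{1}{|V|} + \lambda_1 P(w_i) + \lambda_2 P(w_i \mid w_{i-1}) + \lambda_3 P(w_i \mid w_{i-2}, w_{i-1}), \qquad \sum_{k=0}^{3} \lambda_k = 1.$$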
from typing import List, Dict, Tuple
from collections import defaultdict
from collections import Counter
datasets : Dict[str, Dict[str, List[str]]] = {}
for text in texts:
datasets[text] = {
"test" : texts[text][-20_000:], ## last 20000 words
"heldout" : texts[text][:-20_000][-40_000:], ## last 40000 words from the what is left without the test
"train" : texts[text][:-60_000] ## the rest
}
## extract word counts from the train dataset for each ngram size and text
ngram_sizes = [3, 2, 1, 0]
counts : Dict[str, Dict[int, Counter]] = {
text : {
1 : Counter(datasets[text]["train"][:-2]),
2 : Counter(zip(datasets[text]["train"][:-1], datasets[text]["train"][1:-1])),
3 : Counter(zip(datasets[text]["train"], datasets[text]["train"][1:], datasets[text]["train"][2:])),
}
for text in texts
}
## calculate the conditional probability of each word given the previous n-1 words
ngram_probs = {}
for text in counts.keys():
uniform_prob = 1 / len(set(datasets[text]["train"]))
unigram_prob = {word : count / len(datasets[text]["train"]) for word, count in counts[text][1].items()}
bigram_prob = {pair : count / counts[text][1][pair[0]] for pair, count in counts[text][2].items()}
trigram_prob = {triple : count / counts[text][2][triple[:2]] for triple, count in counts[text][3].items()}
ngram_probs[text] = {0 : uniform_prob, 1 : unigram_prob, 2 : bigram_prob, 3 : trigram_prob}
## compute four smoothing parameters from the heldout dataset using EM algorithm
def em_algo(text : List[str],
uniform_prob : float,
unigram_cond_probs : Dict[Tuple[str], float],
bigram_cond_probs : Dict[Tuple[str, str], float],
trigram_cond_probs : Dict[Tuple[str, str, str], float],
lambdas : List[float],
epsilon : float = 1e-6) -> List[float]:
## retransform n-grams to the form of Dict[history, Dict[word, prob]]
bigram_cond_probs_transformed : Dict['str', Dict['str', float]] = {}
for (history, word), prob in bigram_cond_probs.items():
if history not in bigram_cond_probs_transformed:
bigram_cond_probs_transformed[history] = {}
bigram_cond_probs_transformed[history][word] = prob
trigram_cond_probs_transformed : Dict[Tuple[str, str], Dict[str, float]] = {}
for (history_0, history_1, word), prob in trigram_cond_probs.items():
if (history_0, history_1) not in trigram_cond_probs_transformed:
trigram_cond_probs_transformed[(history_0, history_1)] = {}
trigram_cond_probs_transformed[(history_0, history_1)][word] = prob
prev_lambdas = lambdas[:]
## iterate until convergence
while True:
expected_counts = [0] * len(lambdas)
        for i in range(2, len(text)):
            ## predict w_i from its two preceding words; start at i = 2 so the history is always well defined
            wi_2, wi_1, wi = text[i-2], text[i-1], text[i]
## calculate the probability of the trigram
m0 = uniform_prob
m1 = unigram_cond_probs.get(wi, 0)
m2 = bigram_cond_probs_transformed[wi_1].get(wi, 0) if (wi_1 in bigram_cond_probs_transformed) else uniform_prob
m3 = trigram_cond_probs_transformed[(wi_2, wi_1)].get(wi, 0) if ((wi_2, wi_1) in trigram_cond_probs_transformed) else uniform_prob
current_prob = 0
current_prob += lambdas[0] * m0
current_prob += lambdas[1] * m1
current_prob += lambdas[2] * m2
current_prob += lambdas[3] * m3
## calculate the expected counts
expected_counts[0] += lambdas[0] * m0 / current_prob
expected_counts[1] += lambdas[1] * m1 / current_prob
expected_counts[2] += lambdas[2] * m2 / current_prob
expected_counts[3] += lambdas[3] * m3 / current_prob
## normalize the expected counts
total_counts = sum(expected_counts)
lambdas = [count / total_counts for count in expected_counts]
## check for convergence
if all([abs(old - new) < epsilon for new, old in zip(lambdas, prev_lambdas)]):
break
prev_lambdas = lambdas[:]
return lambdas
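Each iteration of em_algo performs the usual E-step and M-step on the heldout data: expected counts are accumulated for every mixture component and then renormalized to give the new weights,

$$c_k = \sum_i \frac{\lambda_k\, p_k(w_i \mid h_i)}{\sum_l \lambda_l\, p_l(w_i \mid h_i)}, \qquad \lambda_k^{new} = \frac{c_k}{\sum_l c_l},$$

where p_0 is the uniform distribution and p_1, p_2, p_3 are the unigram, bigram, and trigram estimates.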
from math import log2
def get_cross_entropy(
text : List[str],
uniform_prob : float,
unigram_cond_probs : Dict[Tuple[str], float],
bigram_cond_probs : Dict[Tuple[str, str], float],
trigram_cond_probs : Dict[Tuple[str, str, str], float],
lambdas : List[float]
) -> float:
cross_entropy = 0
    for i in range(2, len(text)):
        ## again start at i = 2 so the two-word history never falls outside the text
        wi_2, wi_1, wi = text[i-2], text[i-1], text[i]
current_prob = 0
current_prob += lambdas[0] * uniform_prob
current_prob += lambdas[1] * unigram_cond_probs.get(wi, 0)
current_prob += lambdas[2] * bigram_cond_probs.get((wi_1, wi), 0)
current_prob += lambdas[3] * trigram_cond_probs.get((wi_2, wi_1, wi), 0)
cross_entropy -= log2(current_prob)
return cross_entropy / (len(text) - 2)
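get_cross_entropy evaluates the interpolated model on a text by averaging the negative log-probability over its trigram positions:

$$H(T; P_\lambda) = -\frac{1}{|T| - 2} \sum_{i} \log_2 P_\lambda(w_i \mid w_{i-2}, w_{i-1}).$$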
optimal_lambdas = {
text : em_algo(datasets[text]["heldout"],
ngram_probs[text][0],
ngram_probs[text][1],
ngram_probs[text][2],
ngram_probs[text][3],
[0.25, 0.25, 0.25, 0.25])
for text in texts
}
false_training_lambdas = {
text : em_algo(datasets[text]["train"],
ngram_probs[text][0],
ngram_probs[text][1],
ngram_probs[text][2],
ngram_probs[text][3],
[0.25, 0.25, 0.25, 0.25])
for text in texts
}
for text in texts:
lambdas = optimal_lambdas[text]
print(f"Smoothing parameters for {text}: ")
for i, lambda_ in enumerate(lambdas):
print(f" lambda_{i}: {lambda_*100:.3f}%")
cross_entropy = get_cross_entropy(datasets[text]['test'], ngram_probs[text][0], ngram_probs[text][1], ngram_probs[text][2], ngram_probs[text][3], lambdas)
print(f"Cross-entropy for {text}: {cross_entropy:.3f}")
print()
Smoothing parameters for TEXTCZ1.txt:
  lambda_0: 14.024%
  lambda_1: 42.892%
  lambda_2: 24.460%
  lambda_3: 18.624%
Cross-entropy for TEXTCZ1.txt: 10.443

Smoothing parameters for TEXTEN1.txt:
  lambda_0: 6.967%
  lambda_1: 25.407%
  lambda_2: 49.207%
  lambda_3: 18.418%
Cross-entropy for TEXTEN1.txt: 7.572
Interpretation
We observe that for Czech the model places more emphasis on the unigram than on the bigram or trigram, as the text contains many more unique words, so learned longer sequences are less likely to reappear. For the same reason, the Czech model also keeps a comparatively large weight (about 14%) on the uniform distribution. In contrast, since English has fewer unique words, the chance of seeing a sequence (at least a word pair) from the training set is higher, allowing the model to rely more on the bigram. This is most visible in the uniform component (lambda_0), to which the English model assigns a very low weight.
Because English relies more on the bigram, which provides a meaningful context-dependent prediction (unlike the unigram, which merely reflects how often a word occurs at all), the English model exhibits lower cross-entropy.
for text in texts:
lambdas = false_training_lambdas[text]
print(f"Smoothing parameters for {text}: ")
for i, lambda_ in enumerate(lambdas):
print(f" lambda_{i}: {lambda_*100:.3f}%")
cross_entropy = get_cross_entropy(datasets[text]['test'], ngram_probs[text][0], ngram_probs[text][1], ngram_probs[text][2], ngram_probs[text][3], lambdas)
print(f"Cross-entropy for {text}: {cross_entropy:.3f}")
print()
Smoothing parameters for TEXTCZ1.txt:
  lambda_0: 0.000%
  lambda_1: 0.001%
  lambda_2: 0.001%
  lambda_3: 99.998%
Cross-entropy for TEXTCZ1.txt: 36.181

Smoothing parameters for TEXTEN1.txt:
  lambda_0: 0.001%
  lambda_1: 0.000%
  lambda_2: 0.001%
  lambda_3: 99.998%
Cross-entropy for TEXTEN1.txt: 18.150
For the sake of the experiment, we also ran the EM algorithm on the (same) training data. The model then settles on predicting almost exclusively with the most detailed (trigram) distribution, since the data on which EM was run are the same data from which the component distributions were estimated. The trigram MLE is the most specific of the four components and achieves the highest likelihood on the training data (the lower-order components can be viewed as special cases of a trigram-context model), so EM pushes nearly all of the weight onto it.
Playing around with lambdas
First, we will artificially boost the trigram component of the model (even though the EM algorithm advised against it).
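Concretely, for an offset o we move a fraction o of the remaining probability mass onto the trigram weight and rescale the other three weights proportionally, so the mixture still sums to one:

$$\lambda_3' = \lambda_3 + o\,(1 - \lambda_3), \qquad \lambda_k' = \lambda_k\,\frac{1 - \lambda_3'}{1 - \lambda_3} \quad (k \in \{0, 1, 2\}).$$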
offsets : List[float] = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
trigram_boost : Dict[str, Dict[float, float]] = {}
for text in texts:
trigram_boost[text] = {}
trigram_lambda = optimal_lambdas[text][3]
trigram_diff = 1 - trigram_lambda
for offset in offsets:
remaining_lambdas_mass = sum(optimal_lambdas[text][:3])
remaining_mass = 1 - (trigram_lambda+trigram_diff*offset)
lambdas = [lambda_ * remaining_mass / remaining_lambdas_mass for lambda_ in optimal_lambdas[text][:3]]+[trigram_lambda+trigram_diff*offset]
if (sum(lambdas) < 0.9999):
print(f"Error: lambdas sum to {sum(lambdas)}")
cross_entropy = get_cross_entropy(datasets[text]['test'], ngram_probs[text][0], ngram_probs[text][1], ngram_probs[text][2], ngram_probs[text][3], lambdas)
trigram_boost[text][offset] = cross_entropy
Secondly, we will suppress the trigram model and emphasize the bigram and unigram components.
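Here the offset is instead the fraction of the tuned trigram weight that we keep (offset 0 discards the trigram entirely), with the other weights rescaled in the same way:

$$\lambda_3' = o\,\lambda_3, \qquad \lambda_k' = \lambda_k\,\frac{1 - \lambda_3'}{1 - \lambda_3} \quad (k \in \{0, 1, 2\}).$$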
offsets : List[float] = [0.99, 0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0]
lowergram_boost : Dict[str, Dict[float, float]] = {}
for text in texts:
lowergram_boost[text] = {}
trigram_lambda = optimal_lambdas[text][3]
for offset in offsets:
new_trigram_lambda = trigram_lambda*offset
remaining_mass = 1 - new_trigram_lambda
lambdas = [lambda_ * remaining_mass / (1 - trigram_lambda) for lambda_ in optimal_lambdas[text][:3]]+[new_trigram_lambda]
if (sum(lambdas) < 0.9999):
print(f"Error: lambdas sum to {sum(lambdas)}")
cross_entropy = get_cross_entropy(datasets[text]['test'], ngram_probs[text][0], ngram_probs[text][1], ngram_probs[text][2], ngram_probs[text][3], lambdas)
lowergram_boost[text][offset] = cross_entropy
import matplotlib.pyplot as plt
# Plot settings
plt.figure(figsize=(12, 6))
# Loop over each text in the dictionaries
for text in texts:
offsets_trigram = list(trigram_boost[text].keys())
cross_entropies_trigram = list(trigram_boost[text].values())
offsets_lowergram = list(lowergram_boost[text].keys())
cross_entropies_lowergram = list(lowergram_boost[text].values())
# Plot trigram_boost cross-entropy
plt.plot(offsets_trigram, cross_entropies_trigram, marker='o', label=f'{text} Trigram Boost')
# Plot lowergram_boost cross-entropy
plt.plot(offsets_lowergram, cross_entropies_lowergram, marker='x', label=f'{text} Lowergram Boost')
# Adding labels and title
plt.xlabel('Offset')
plt.ylabel('Cross Entropy')
plt.title('Cross Entropy Across Offsets for Trigram and Lowergram Boost')
plt.legend()
plt.grid(True)
plt.show()
import pandas as pd
# Create DataFrame for trigram_boost values
trigram_boost_df = pd.DataFrame(trigram_boost).T # Transpose to have texts as rows, offsets as columns
trigram_boost_df.columns = [f'Offset {offset}' for offset in trigram_boost_df.columns]
trigram_boost_df.index.name = 'Text'
# Create DataFrame for lowergram_boost values
lowergram_boost_df = pd.DataFrame(lowergram_boost).T # Transpose to have texts as rows, offsets as columns
lowergram_boost_df.columns = [f'Offset {offset}' for offset in lowergram_boost_df.columns]
lowergram_boost_df.index.name = 'Text'
# Display the DataFrames
print("Trigram Boost Cross-Entropy Table:")
display(trigram_boost_df)
print("\nLowergram Boost Cross-Entropy Table:")
display(lowergram_boost_df)
Trigram Boost Cross-Entropy Table:
| Text | Offset 0.0 | Offset 0.1 | Offset 0.2 | Offset 0.3 | Offset 0.4 | Offset 0.5 | Offset 0.6 | Offset 0.7 | Offset 0.8 | Offset 0.9 | Offset 0.95 | Offset 0.99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TEXTCZ1.txt | 10.394591 | 10.406147 | 10.474226 | 10.579072 | 10.718923 | 10.899240 | 11.133112 | 11.447540 | 11.905063 | 12.707247 | 13.521420 | 15.428067 |
| TEXTEN1.txt | 7.560139 | 7.571013 | 7.613759 | 7.683418 | 7.780816 | 7.910988 | 8.084625 | 8.323388 | 8.677306 | 9.307598 | 9.953423 | 11.474359 |
Lowergram Boost Cross-Entropy Table:
| Text | Offset 0.99 | Offset 0.95 | Offset 0.9 | Offset 0.8 | Offset 0.7 | Offset 0.6 | Offset 0.5 | Offset 0.4 | Offset 0.3 | Offset 0.2 | Offset 0.1 | Offset 0.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TEXTCZ1.txt | 10.394853 | 10.395974 | 10.397545 | 10.401313 | 10.406043 | 10.411941 | 10.419303 | 10.428586 | 10.440555 | 10.456712 | 10.480960 | 10.551031 |
| TEXTEN1.txt | 7.560288 | 7.560996 | 7.562145 | 7.565397 | 7.570077 | 7.576439 | 7.584839 | 7.595821 | 7.610277 | 7.629889 | 7.658753 | 7.723425 |
Interpretation
From the results we see that, for both languages, the more we prioritize trigram predictions, the worse the model becomes. This makes sense: the longer the n-gram, the fewer distinct sequences are observed during training, so more test trigrams are unseen and the boosted trigram component predicts them poorly.
Before commenting further on the results, note that the two experiments use the offset differently, so the lower-gram boost curves effectively run in the opposite direction along the x-axis: there the offset is the fraction of the tuned trigram lambda that is kept, not the amount added to it.
Suppressing the trigram and boosting the unigram and bigram has a similarly mild effect in both languages. Since the trigram contributed at most ~19% to the final prediction, we do not observe a significant decline when we suppress it. In essence, strengthening the components that already carry most of the weight in the mixture is far less harmful than shifting weight onto a minority component.