lecture_12-content.js
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 15}], "Last lecture: overview of datasets used for training LMs", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 16}], "Live service -> dump/crawl -> processed data", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 17}], "Processing: HTML->text, language/quality/toxicity filtering, deduplication", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 19}], "This lecture:", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 20}], "- Algorithms for filtering (e.g., classifiers)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 21}], "- Applications of filtering (e.g., language, quality, toxicity)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 22}], "- Stare at some datasets (if we have time)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 24}], "## Algorithms", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 26}], "Algorithmic building block: given some target data T and lots of raw data R, find subset of R similar to T", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 30}], "Desiderata for filtering algorithm:", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 31}], "- Generalize from the target data", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 32}], "- Extremely fast", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 54}], "n-gram model with Kneser-Ney smoothing", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 54}], "https://en.wikipedia.org/wiki/Kneser%E2%80%93Ney_smoothing", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 56}], "KenLM: fast implementation originally for machine translation", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 56}], "https://kheafield.com/code/kenlm/", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 57}], "Common language model used for data filtering", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 58}], "Extremely simple / fast - just count and normalize", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 60}], "## Key ingredients", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 62}], "Maximum likelihood estimation of n-gram language model: p(in | the cat) = count(the cat in) / count(the cat)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 65}], "Interpolation: p(in | the cat) = (1 - \u03bb(the cat)) * count(the cat in) / count(the cat) + \u03bb(the cat) * p(in | cat)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 69}], "Discounting (motivation: Good-Turing estimate for cracking German ciphers during WWII): p(in | the cat) = (count(the cat in) - d) / count(the cat) + \u03bb(the cat) * p(in | cat)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 73}], "Motivation: p(Francisco) is large, but mostly because of 'San Francisco'", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 74}], "Thus, we should not use count(Francisco), but instead number of unique contexts (So San Francisco counts once): |{ w: count(w Francisco) }|", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 77}], "Kneser-Ney smoothing: p(Francisco) = |{ w : count(w Francisco) }| / |{ w w': count(w w') > 0 }|", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 100}, {"name": "print_perplexity", "filename": "lecture_12.py", "lineno": 98}], "log p(<s> Stanford University was founded in 1885 by Leland and Jane Stanford . </s>) = -75.7110824584961, perplexity = 155.61818844501636", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 101}, {"name": "print_perplexity", "filename": "lecture_12.py", "lineno": 98}], "log p(<s> University Stanford founded was 1885 in Leland by and Stanford Jane . </s>) = -75.71109008789062, perplexity = 155.6182675965402", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 102}, {"name": "print_perplexity", "filename": "lecture_12.py", "lineno": 98}], "log p(<s> Stanford University was founded in 1885 by Leland and Jane Stanford , dedicated to the memory of Leland Stanford Jr . , their only child . </s>) = -149.6759796142578, perplexity = 174.38067142603157", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 103}, {"name": "print_perplexity", "filename": "lecture_12.py", "lineno": 98}], "log p(<s> The quick brown fox jumps over the lazy dog . </s>) = -61.6954345703125, perplexity = 115.1001546400582", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 104}, {"name": "print_perplexity", "filename": "lecture_12.py", "lineno": 98}], "log p(<s> the the the the the </s>) = -31.118854522705078, perplexity = 48.90388373634129", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 105}, {"name": "print_perplexity", "filename": "lecture_12.py", "lineno": 98}], "log p(<s> asdf asdf asdf asdf asdf </s>) = -44.84880447387695, perplexity = 272.08120219784354", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 107}], "## CCNet", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 107}], "https://arxiv.org/pdf/1911.00359", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 108}], "- Items are paragraphs of text", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 109}], "- Sort paragraphs by perplexity", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 110}], "- Keep the top 1/3", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 34}, {"name": "kenlm_main", "filename": "lecture_12.py", "lineno": 112}], "Summary: Kneser-Ney language models (fast), KenLM is fast implementation", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 116}], "fastText classifier [Joulin+ 2016]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 116}], "https://arxiv.org/pdf/1607.01759", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 118}], "Popular choice for language model data filtering due to convenience", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 120}], "Task: text classification (e.g., sentiment classification)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 121}], "Goal was to train a fast classifier", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 122}], "Found was as good as much slower neural network classifiers", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 124}], "## Baseline: bag of words", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 131}], "Problem: V * K parameters (could be huge)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 133}], "## fastText classifier: bag of word embeddings", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 138}], "Only H (V + K) parameters", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 140}], "Parallelized, asynchronous SGD", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 141}], "Learning rate: linear interpolation from [some number] to 0", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 141}], "https://github.com/facebookresearch/fastText/blob/main/src/fasttext.cc#L653", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 143}], "## Bag of n-grams", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 145}], "Number of bigrams can get large (and also be unbounded)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 147}], "Hashing trick", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 35}, {"name": "fasttext_main", "filename": "lecture_12.py", "lineno": 151}], "For 2 classes, this is just a linear classifier", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 155}], "## Data Selection for Language Models via Importance Resampling (DSIR) [Xie+ 2023]", {})
addImage([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 156}], "https://www.jinghong-chen.net/content/images/size/w1200/2023/12/Screenshot-2023-12-24-at-17.41.38.png", {"width": "50.0%"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 158}], "## Importance resampling", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 160}], "Setup:", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 161}], "- Target distribution p (want samples from here)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 162}], "- Proposal distribution q (have samples from here)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 170}], "Samples (q): [1 2 0 0 1 0 0 2 2 1 1 1 1 2 0 1 1 1 1 0 0 2 0 2 0 0 0 0 0 0 2 0 1 2 1 0 2\n 0 1 2 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 2 2 0 0 0 3 0 3 0 0 0 1 3 2 2 0 1 0 2\n 0 0 0 1 2 3 2 1 3 0 2 0 2 0 2 0 0 0 1 3 0 3 2 1 0 1]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 179}], "Resampled (p): [0 2 0 2 3 2 1 2 2 1 1 0 2 3 3 3 1 0 1 1 2 2 0 2 0 1 2 2 2 2 1 2 2 2 0 2 1\n 2 1 0 2 2 3 2 3 3 2 1 3 2 1 1 3 2 3 3 2 1 2 3 2 3 0 3 1 2 0 3 1 2 2 3 2 2\n 3 2 2 3 3 3 1 2 0 3 3 3 0 3 3 1 2 3 2 1 2 0 1 3 2 3]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 181}], "## Hashed n-grams", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 183}], "Setup:", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 184}], "- Target dataset D_p (small)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 185}], "- Proposal (raw) dataset D_q (large)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 187}], "First thought:", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 188}], "- Fit a distribution p to D_p", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 189}], "- Fit a distribution q to D_q", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 190}], "- Do importance resampling on D_q", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 192}], "Problem: |D_p| is too small to estimate a good model", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 194}], "Solution: use hashed n-grams", {})
addImage([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 225}], "https://neurips.cc/media/PosterPDFs/NeurIPS%202023/70154.png?t=1701377065.5253515", {"width": "100.0%"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 36}, {"name": "dsir", "filename": "lecture_12.py", "lineno": 226}], "Result: DSIR slightly better than heuristic classification (fastText)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 230}], "Implementations: KenLM, fastText, DSIR", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 232}], "General framework", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 233}], "Given target T and raw R, find subset of R similar to T", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 234}], "Two pieces", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 235}], "1. Estimate a score (some model)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 236}], "2. Keep examples based on the score", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 238}], "Generative model of T (KenLM)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 239}], "1. score(x) = p_T(x)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 240}], "2. Keep examples x with score(x) >= threshold", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 242}], "Discriminative classifier (fastText)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 243}], "1. score(x) = p(T | x)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 244}], "2. Keep examples x with score(x) >= threshold (stochastically)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 246}], "Impotance resampling (DSIR)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 247}], "1. score(x) = p_T(x) / q_T(x)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 37}, {"name": "filtering_summary", "filename": "lecture_12.py", "lineno": 248}], "2. Resample examples x with probability proportional to score(x)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 262}], "Language identification: find text of a specific language (e.g., English)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 264}], "Why not go multilingual?", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 265}], "- Data: difficult to do curation / processing of high-quality data in any given language", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 266}], "- Compute: in computed-limited regime, less compute/tokens dedicated to any given language", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 267}], "English was only 30% of BLOOM, English performance suffered", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 267}], "https://arxiv.org/pdf/2303.03915", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 268}], "Chinese models (Yi, Qwen, DeepSeek) are mostly English/Chinese", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 269}], "GPT-4, Claude, Gemini are all multilingual", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 271}], "Language identification via fastText", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 272}], "https://fasttext.cc/docs/en/language-identification.html", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 274}], "Supports 176 languages", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 275}], "Trained on multilingual sites: Wikipedia, Tatoeba (translation site) and SETimes (Southeast European news)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 277}], "Dolma keeps pages with p(English) >= 0.5", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 286}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "The quick brown fox jumps over the lazy dog. => __label__en [0.71621013]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 287}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. => __label__en [0.71852088]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 288}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "OMG that movie was \ud83d\udd25\ud83d\udd25! So dope \ud83d\ude0e\ud83e\udd18! => __label__en [0.99213725]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 289}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "Auf dem Wasser zu singen => __label__de [0.97517276]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 290}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "The quadratic formula is $x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$. => __label__en [0.48865604]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 291}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "for (int i = 0; i < 10; i++) => __label__ru [0.24752462]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 292}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "hello => __label__en [0.30061391]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 293}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "bonjour => __label__fr [0.96965522]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 294}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "Feliz Navidad / Pr\u00f3spero a\u00f1o y felicidad / I wanna wish you a Merry Christmas => __label__es [0.93549502]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 296}], "Caveats:", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 297}], "- Difficult for short sequences", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 298}], "- Difficult for low-resource languages", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 299}], "- Could accidentally filter out dialects of English", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 300}], "- Hard for similar languages (Malay and Indoneisan)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 301}], "- Ill-defined for code-switching (e.g., Spanish + English)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 303}], "## OpenMathText", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 303}], "https://arxiv.org/pdf/2310.06786", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 304}], "Goal: curate large corpus of mathematical text from CommonCrawl", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 305}], "- Use rules to filter (e.g., contains latex commands)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 306}], "- KenLM trained on ProofPile, keep if perplexity > 15000", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 307}], "- Trained fastText classifier to predict mathematical writing, threshold is 0.17 if math, 0.8 if no math", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 39}, {"name": "language_identification", "filename": "lecture_12.py", "lineno": 308}], "Result: 14.7B tokens, 1.4B models do better than models trained on 20x data", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 312}], "Some deliberately do not used model-based filtering (C4, Gopher, RefinedWeb, FineWeb, Dolma)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 313}], "Some use model-based filtering (GPT-3, LLaMA)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 316}], "## GPT-3", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 316}], "https://arxiv.org/pdf/2005.14165", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 317}], "- Positives: samples from {Wikipedia, WebText, Books1, Books2}", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 318}], "- Negatives: samples from CommonCrawl", {})
addImage([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 319}], "https://upload.wikimedia.org/wikipedia/commons/thumb/1/11/Probability_density_function_of_Pareto_distribution.svg/325px-Probability_density_function_of_Pareto_distribution.svg.png", {"width": "50.0%"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 320}], "Train linear classifier based on word features", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 320}], "https://spark.apache.org/docs/latest/ml-features#tokenizer", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 321}], "Keep documents stochastically based on score", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 324}], 0.0, {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 325}], 1.0, {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 328}], "## LLaMA/RedPajama", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 328}], "https://arxiv.org/pdf/2302.13971", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 329}], "- Positives: samples from pages referenced by Wikipedia", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 329}], "https://en.wikipedia.org/wiki/Sphinx", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 330}], "- Negatives: samples from CommonCrawl", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 331}], "Keep documents that are classified positive", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 334}], "## phi-1 [Gunasekara+ 2023 (Microsoft)]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 334}], "https://arxiv.org/pdf/2306.11644", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 335}], "Philosophy: really high quality data (textbooks) to train a small model (1.5B)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 336}], "Includes synthetic data from GPT 3.5 (later: GPT-4) and filtered data", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 341}], "Run GPT-4 on T with prompt to generate positives and negatives", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 342}], "Train random forest classifier using output embedding from pretrained codegen model", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 344}], "Result on HumanEval", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 344}], "https://huggingface.co/datasets/openai_humaneval", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 345}], "- Train 1.3B LM on Python subset of The Stack (performance: 12.19% after 96K steps)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 40}, {"name": "quality_filtering", "filename": "lecture_12.py", "lineno": 346}], "- Train 1.3B LM on filtered subset (performance: 17.68% after 36K steps)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 357}], "## Dolma toxicity filtering", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 359}], "Dataset: Jigsaw Toxic Comments dataset [2018]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 360}], "https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 361}], "https://www.kaggle.com/datasets/julian3833/jigsaw-toxic-comment-classification-challenge", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 363}], "Project goal: help people have better discussions online", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 363}], "https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/discussion/46064", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 364}], "Data: Wikipedia comments annotated with {toxic, severe_toxic, obscene, threat, insult, identity_hate}", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 366}], "Trained 2 fastText classifiers", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 367}], "- hate: positive = {unlabeled, obscene}, negative = all else", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 368}], "- NSFW: positive = {obscene}, negative = all else", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 383}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "Are you threatening me for disputing neutrality? I know in your country it's quite common to bully your way through a discussion and push outcomes you want. But this is not Russia. => __label__non-nsfw [0.99995422]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 384}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell! => __label__nsfw [1.00001001]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 385}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "I love strawberries => __label__non-nsfw [1.00001001]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 41}, {"name": "toxicity_filtering", "filename": "lecture_12.py", "lineno": 386}, {"name": "print_predict", "filename": "lecture_12.py", "lineno": 393}], "I hate strawberries => __label__non-nsfw [1.00001001]", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 43}], "## FineWeb", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 44}], "15T tokens (Common Crawl with C4/Gopher filtering, fuzzy deduplication, PII removal)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 45}], "https://huggingface.co/datasets/HuggingFaceFW/fineweb", {"color": "gray"})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 47}], "## Summary", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 48}], "- Algorithmic tools: n-gram models (KenLM), classifiers (fastText), importance resampling (DSIR)", {})
addText([{"name": "lecture_12", "filename": "lecture_12.py", "lineno": 49}], "- Applications: language identification, quality filtering, toxicity filtering", {})