[{"data":1,"prerenderedAt":1237},["ShallowReactive",2],{"navigation_docs_en":3,"\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02-1-training-data":77,"\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02-1-training-data-surround":1232},[4],{"title":5,"icon":6,"path":7,"stem":8,"children":9,"page":45},"AI Engineering",null,"\u002Fen\u002Fai-engineering","en\u002F1.ai-engineering",[10,46],{"title":11,"icon":12,"path":13,"stem":14,"children":15,"page":45},"Introduction to Building AI Applications with Foundation Models","i-lucide-brain-circuit","\u002Fen\u002Fai-engineering\u002Fintro","en\u002F1.ai-engineering\u002F1.intro",[16,20,25,30,35,40],{"title":11,"path":17,"stem":18,"icon":19},"\u002Fen\u002Fai-engineering\u002Fintro\u002Fch01","en\u002F1.ai-engineering\u002F1.intro\u002Fch01","i-lucide-sparkles",{"title":21,"path":22,"stem":23,"icon":24},"The Rise of AI Engineering","\u002Fen\u002Fai-engineering\u002Fintro\u002Fch011-the-rise-of-ai-engineering","en\u002F1.ai-engineering\u002F1.intro\u002Fch011-the-rise-of-ai-engineering","i-lucide-history",{"title":26,"path":27,"stem":28,"icon":29},"Foundation Model Use Cases","\u002Fen\u002Fai-engineering\u002Fintro\u002Fch012-foundation-model-use-cases","en\u002F1.ai-engineering\u002F1.intro\u002Fch012-foundation-model-use-cases","i-lucide-layout-grid",{"title":31,"path":32,"stem":33,"icon":34},"Planning AI Applications","\u002Fen\u002Fai-engineering\u002Fintro\u002Fch013-planning-ai-applications","en\u002F1.ai-engineering\u002F1.intro\u002Fch013-planning-ai-applications","i-lucide-clipboard-list",{"title":36,"path":37,"stem":38,"icon":39},"The AI Engineering 
Stack","\u002Fen\u002Fai-engineering\u002Fintro\u002Fch014-the-ai-engineering-stack","en\u002F1.ai-engineering\u002F1.intro\u002Fch014-the-ai-engineering-stack","i-lucide-layers",{"title":41,"path":42,"stem":43,"icon":44},"Summary","\u002Fen\u002Fai-engineering\u002Fintro\u002Fch015-summary","en\u002F1.ai-engineering\u002F1.intro\u002Fch015-summary","i-lucide-flag",false,{"title":47,"icon":6,"path":48,"stem":49,"children":50,"page":45},"Understanding Foundation Models","\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models","en\u002F1.ai-engineering\u002F2.understanding-foundation-models",[51,54,59,64,69,74],{"title":47,"path":52,"stem":53,"icon":12},"\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02","en\u002F1.ai-engineering\u002F2.understanding-foundation-models\u002Fch02",{"title":55,"path":56,"stem":57,"icon":58},"Training Data","\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02-1-training-data","en\u002F1.ai-engineering\u002F2.understanding-foundation-models\u002Fch02-1-training-data","i-lucide-database",{"title":60,"path":61,"stem":62,"icon":63},"Modeling","\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02-2-modeling","en\u002F1.ai-engineering\u002F2.understanding-foundation-models\u002Fch02-2-modeling","i-lucide-network",{"title":65,"path":66,"stem":67,"icon":68},"Post-Training","\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02-3-post-training","en\u002F1.ai-engineering\u002F2.understanding-foundation-models\u002Fch02-3-post-training","i-lucide-sliders-horizontal",{"title":70,"path":71,"stem":72,"icon":73},"Sampling","\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02-4-sampling","en\u002F1.ai-engineering\u002F2.understanding-foundation-models\u002Fch02-4-sampling","i-lucide-dices",{"title":41,"path":75,"stem":76,"icon":44},"\u002Fen\u002Fai-engineering\u002Funderstanding-foundation-models\u002Fch02-5-summary","en\u002
F1.ai-engineering\u002F2.understanding-foundation-models\u002Fch02-5-summary",{"id":78,"title":55,"body":79,"description":1226,"extension":1227,"links":6,"meta":1228,"navigation":1229,"path":56,"seo":1230,"stem":57,"__hash__":1231},"docs_en\u002Fen\u002F1.ai-engineering\u002F2.understanding-foundation-models\u002Fch02-1-training-data.md",{"type":80,"value":81,"toc":1212},"minimark",[82,92,97,101,120,124,141,145,164,168,177,181,184,188,199,202,222,225,229,241,247,483,486,491,686,690,693,719,726,731,734,740,744,747,750,754,757,776,785,796,800,803,812,837,840,844,864,885,889,909,923,929,934,946,955,960,1072,1076,1079,1091,1094,1130,1133,1136],[83,84,85,89],"u-page-hero",{},[86,87,55],"template",{"v-slot:title":88},"",[86,90,91],{"v-slot:description":88},"An AI model is only as good as the data it was trained on. Training data determines what a model can do, where it struggles, and which applications it can support reliably.",[93,94,96],"h2",{"id":95},"data-sets-the-boundary","Data Sets the Boundary",[98,99,100],"p",{},"If there's no Vietnamese in the training data, the model won't be able to translate from English into Vietnamese. 
Similarly, if an image classification model sees only animals in its training set, it won't perform well on photos of plants.",[102,103,104,110,115],"card-group",{},[105,106,109],"card",{"icon":107,"title":108},"i-lucide-target","More Task Data Can Help","If you want a model to improve on a certain task, you might want to include more data for that task in the training data.",[105,111,114],{"icon":112,"title":113},"i-lucide-coins","Collection Is Expensive","Collecting sufficient data for training a large model isn't easy, and it can be expensive.",[105,116,119],{"icon":117,"title":118},"i-lucide-archive","Available Data Shapes Models","Model developers often have to rely on available data, even if this data doesn't exactly meet their needs.",[93,121,123],{"id":122},"common-crawl-and-c4","Common Crawl and C4",[98,125,126,127,134,135,140],{},"For example, a common source for training data is ",[128,129,133],"a",{"href":130,"rel":131},"https:\u002F\u002Fcommoncrawl.org\u002F",[132],"nofollow","Common Crawl",", created by a nonprofit organization that sporadically crawls websites on the internet. In 2022 and 2023, this organization crawled approximately 2-3 billion web pages each month. 
Google provides a clean subset of Common Crawl called the ",[128,136,139],{"href":137,"rel":138},"https:\u002F\u002Farxiv.org\u002Fabs\u002F1910.10683v4",[132],"Colossal Clean Crawled Corpus, or C4 for short",".",[142,143,144],"warning",{},"The data quality of Common Crawl, and C4 to a certain extent, is questionable -- think clickbait, misinformation, propaganda, conspiracy theories, racism, misogyny, and every sketchy website you've ever seen or avoided on the internet.",[98,146,147,148,157,158,163],{},"A ",[128,149,152,153],{"href":150,"rel":151},"https:\u002F\u002Fwww.washingtonpost.com\u002Ftechnology\u002Finteractive\u002F2023\u002Fai-chatbot-learning\u002F",[132],"study by the ",[154,155,156],"em",{},"Washington Post"," shows that the 1,000 most common websites in the dataset include several media outlets that rank low on ",[128,159,162],{"href":160,"rel":161},"https:\u002F\u002Fwww.newsguardtech.com\u002Fsolutions\u002Fnews-reliability-ratings\u002F",[132],"NewsGuard's scale for trustworthiness",". In lay terms, Common Crawl contains plenty of fake news.",[165,166,167],"note",{},"Yet, simply because Common Crawl is available, variations of it are used in most foundation models that disclose their training data sources, including OpenAI's GPT-3 and Google's Gemini. I suspect that Common Crawl is also used in models that don't disclose their training data. To avoid scrutiny from both the public and competitors, many companies have stopped disclosing this information.",[98,169,170,171,176],{},"Some teams use heuristics to filter out low-quality data from the internet. For example, OpenAI used only the Reddit links that received at least three upvotes to train ",[128,172,175],{"href":173,"rel":174},"https:\u002F\u002Fcdn.openai.com\u002Fbetter-language-models\u002Flanguage_models_are_unsupervised_multitask_learners.pdf",[132],"GPT-2",". 
While this does help screen out links that nobody cares about, Reddit isn't exactly the pinnacle of propriety and good taste.",[93,178,180],{"id":179},"curating-for-the-work-you-need","Curating for the Work You Need",[98,182,183],{},"The \"use what we have, not what we want\" approach may lead to models that perform well on tasks present in the training data but not necessarily on the tasks you care about.",[185,186,187],"tip",{},"To address this issue, it's crucial to curate datasets that align with your specific needs.",[98,189,190,191,194,195,198],{},"This section focuses on curating data for specific ",[154,192,193],{},"languages"," and ",[154,196,197],{},"domains",", providing a broad yet specialized foundation for applications within those areas. Chapter 8 explores data strategies for models tailored to highly specific tasks.",[98,200,201],{},"While language- and domain-specific foundation models can be trained from scratch, it's also common to finetune them on top of general-purpose models.",[203,204,205,211],"accordion",{},[206,207,210],"accordion-item",{"icon":208,"label":209},"i-lucide-circle-help","Why not train on all available data?","Some might wonder, why not just train a model on all data available, both general data and specialized data, so that the model can do everything? This is what many people do. However, training on more data often requires more compute resources and doesn't always lead to better performance.",[206,212,215,216,221],{"icon":213,"label":214},"i-lucide-badge-check","Can smaller, cleaner data win?","For example, a model trained with a smaller amount of high-quality data might outperform a model trained with a large amount of low-quality data. Using 7B tokens of high-quality coding data, ",[128,217,220],{"href":218,"rel":219},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2306.11644",[132],"Gunasekar et al. 
(2023)"," were able to train a 1.3B-parameter model that outperforms much larger models on several important coding benchmarks.",[98,223,224],{},"The impact of data quality is discussed more in Chapter 8.",[93,226,228],{"id":227},"multilingual-models","Multilingual Models",[98,230,231,232,237,238,140],{},"English dominates the internet. An analysis of the Common Crawl dataset shows that English accounts for almost half of the data (45.88%), making it eight times more ",[128,233,236],{"href":234,"rel":235},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2304.05613",[132],"prevalent than the second-most common language, Russian (5.97%) (Lai et al., 2023)",". See Table 2-1 for a list of languages with at least 1% in Common Crawl. Languages with limited availability as training data -- typically languages not included in this list -- are considered ",[154,239,240],{},"low-resource",[242,243,244],"blockquote",{},[98,245,246],{},"Table 2-1. The most common languages in Common Crawl, a popular dataset for training LLMs. Source: Lai et al. 
(2023).",[248,249,250,271],"table",{},[251,252,253],"thead",{},[254,255,256,260,263,266,269],"tr",{},[257,258,259],"th",{},"Language",[257,261,262],{},"Code",[257,264,265],{},"Pop.",[257,267,268],{},"CC size",[257,270],{},[272,273,274,290,307,323,339,355,371,387,403,419,435,451,467],"tbody",{},[254,275,276,279,281,284,287],{},[277,278],"td",{},[277,280],{},[277,282,283],{},"(M)",[277,285,286],{},"(%)",[277,288,289],{},"Cat.",[254,291,292,295,298,301,304],{},[277,293,294],{},"English",[277,296,297],{},"en",[277,299,300],{},"1,452",[277,302,303],{},"45.8786",[277,305,306],{},"H",[254,308,309,312,315,318,321],{},[277,310,311],{},"Russian",[277,313,314],{},"ru",[277,316,317],{},"258",[277,319,320],{},"5.9692",[277,322,306],{},[254,324,325,328,331,334,337],{},[277,326,327],{},"German",[277,329,330],{},"de",[277,332,333],{},"134",[277,335,336],{},"5.8811",[277,338,306],{},[254,340,341,344,347,350,353],{},[277,342,343],{},"Chinese",[277,345,346],{},"zh",[277,348,349],{},"1,118",[277,351,352],{},"4.8747",[277,354,306],{},[254,356,357,360,363,366,369],{},[277,358,359],{},"Japanese",[277,361,362],{},"jp",[277,364,365],{},"125",[277,367,368],{},"4.7884",[277,370,306],{},[254,372,373,376,379,382,385],{},[277,374,375],{},"French",[277,377,378],{},"fr",[277,380,381],{},"274",[277,383,384],{},"4.7254",[277,386,306],{},[254,388,389,392,395,398,401],{},[277,390,391],{},"Spanish",[277,393,394],{},"es",[277,396,397],{},"548",[277,399,400],{},"4.4690",[277,402,306],{},[254,404,405,408,411,414,417],{},[277,406,407],{},"Italian",[277,409,410],{},"it",[277,412,413],{},"68",[277,415,416],{},"2.5712",[277,418,306],{},[254,420,421,424,427,430,433],{},[277,422,423],{},"Dutch",[277,425,426],{},"nl",[277,428,429],{},"30",[277,431,432],{},"2.0585",[277,434,306],{},[254,436,437,440,443,446,449],{},[277,438,439],{},"Polish",[277,441,442],{},"pl",[277,444,445],{},"45",[277,447,448],{},"1.6636",[277,450,306],{},[254,452,453,456,459,462,465],{},[277,454,455],{},"Portuguese",[277,457,458],{},"pt",[277
,460,461],{},"257",[277,463,464],{},"1.1505",[277,466,306],{},[254,468,469,472,475,478,481],{},[277,470,471],{},"Vietnamese",[277,473,474],{},"vi",[277,476,477],{},"85",[277,479,480],{},"1.0299",[277,482,306],{},[98,484,485],{},"Many other languages, despite having a lot of speakers today, are severely underrepresented in Common Crawl. Table 2-2 shows some of these languages. Ideally, the ratio between world population representation and Common Crawl representation should be 1. The higher this ratio, the more under-represented this language is in Common Crawl.",[242,487,488],{},[98,489,490],{},"Table 2-2. Examples of under-represented languages in Common Crawl. The last row, English, is for comparison. The numbers for % in Common Crawl are taken from Lai et al. (2023).",[248,492,493,521],{},[251,494,495],{},[254,496,497,499,502,515,518],{},[257,498,259],{},[257,500,501],{},"Speakers (million)",[257,503,504,505],{},"% world population",[506,507,508],"sup",{},[128,509,514],{"href":510,"ariaDescribedBy":511,"dataFootnoteRef":88,"id":513},"#user-content-fn-1",[512],"footnote-label","user-content-fnref-1","1",[257,516,517],{},"% in Common Crawl",[257,519,520],{},"World: Common Crawl 
Ratio",[272,522,523,540,557,574,591,608,625,642,659],{},[254,524,525,528,531,534,537],{},[277,526,527],{},"Punjabi",[277,529,530],{},"113",[277,532,533],{},"1.41%",[277,535,536],{},"0.0061%",[277,538,539],{},"231.56",[254,541,542,545,548,551,554],{},[277,543,544],{},"Swahili",[277,546,547],{},"71",[277,549,550],{},"0.89%",[277,552,553],{},"0.0077%",[277,555,556],{},"115.26",[254,558,559,562,565,568,571],{},[277,560,561],{},"Urdu",[277,563,564],{},"231",[277,566,567],{},"2.89%",[277,569,570],{},"0.0274%",[277,572,573],{},"105.38",[254,575,576,579,582,585,588],{},[277,577,578],{},"Kannada",[277,580,581],{},"64",[277,583,584],{},"0.80%",[277,586,587],{},"0.0122%",[277,589,590],{},"65.57",[254,592,593,596,599,602,605],{},[277,594,595],{},"Telugu",[277,597,598],{},"95",[277,600,601],{},"1.19%",[277,603,604],{},"0.0183%",[277,606,607],{},"64.89",[254,609,610,613,616,619,622],{},[277,611,612],{},"Gujarati",[277,614,615],{},"62",[277,617,618],{},"0.78%",[277,620,621],{},"0.0126%",[277,623,624],{},"61.51",[254,626,627,630,633,636,639],{},[277,628,629],{},"Marathi",[277,631,632],{},"99",[277,634,635],{},"1.24%",[277,637,638],{},"0.0213%",[277,640,641],{},"58.10",[254,643,644,647,650,653,656],{},[277,645,646],{},"Bengali",[277,648,649],{},"272",[277,651,652],{},"3.40%",[277,654,655],{},"0.0930%",[277,657,658],{},"36.56",[254,660,661,666,671,676,681],{},[277,662,663],{},[664,665,294],"strong",{},[277,667,668],{},[664,669,670],{},"1452",[277,672,673],{},[664,674,675],{},"18.15%",[277,677,678],{},[664,679,680],{},"45.88%",[277,682,683],{},[664,684,685],{},"0.40",[93,687,689],{"id":688},"what-underrepresentation-does","What Underrepresentation Does",[98,691,692],{},"Given the dominance of English in the internet data, it's not surprising that general-purpose models work much better for English than other languages, according to multiple studies.",[102,694,695,706],{},[105,696,699,700,705],{"icon":697,"title":698},"i-lucide-clipboard-check","MMLU","On the MMLU benchmark, a suite 
of 14,000 multiple-choice problems spanning 57 subjects, ",[128,701,704],{"href":702,"rel":703},"https:\u002F\u002Fopenai.com\u002Findex\u002Fgpt-4-research\u002F",[132],"GPT-4 performed much better in English"," than in under-represented languages like Telugu, as shown in Figure 2-1 (OpenAI, 2023).",[105,707,710,711],{"icon":708,"title":709},"i-lucide-calculator","Project Euler","When tested on six math problems on Project Euler, Yennie Jun found that GPT-4 was able to solve problems in English more than three times as often as in Armenian or Farsi.",[506,712,713],{},[128,714,718],{"href":715,"ariaDescribedBy":716,"dataFootnoteRef":88,"id":717},"#user-content-fn-2",[512],"user-content-fnref-2","2",[98,720,721],{},[722,723],"img",{"alt":724,"src":725},"Figure 2-1. On the MMLU benchmark, GPT-4 performs better in English than in any other language.",".\u002Fmedia\u002Ffig-2-1.png",[242,727,728],{},[98,729,730],{},"Figure 2-1. On the MMLU benchmark, GPT-4 performs better in English than in any other language. To obtain MMLU in other languages, OpenAI translated the questions using Azure AI Translator.",[98,732,733],{},"GPT-4 failed on all six questions for Burmese and Amharic, as shown in Figure 2-2.",[98,735,736],{},[722,737],{"alt":738,"src":739},"Figure 2-2. GPT-4 is much better at math in English than in other languages.",".\u002Fmedia\u002Ffig-2-2.png",[242,741,742],{},[98,743,738],{},[165,745,746],{},"Under-representation is a big reason for this underperformance. The three languages that have the worst performance on GPT-4's MMLU benchmarks -- Telugu, Marathi, and Punjabi -- are also among the languages that are most under-represented in Common Crawl.",[98,748,749],{},"However, under-representation isn't the only reason. 
A language's structure and the culture it embodies can also make a language harder for a model to learn.",[93,751,753],{"id":752},"why-translation-is-not-enough","Why Translation Is Not Enough",[98,755,756],{},"Given that LLMs are generally good at translation, can we just translate all queries from other languages into English, obtain the responses, and translate them back into the original language? Many people indeed follow this approach, but it's not ideal.",[102,758,759,764],{},[105,760,763],{"icon":761,"title":762},"i-lucide-languages","Translation Requires Understanding","This requires a model that can sufficiently understand under-represented languages to translate.",[105,765,768,769,194,772,775],{"icon":766,"title":767},"i-lucide-file-x","Translation Can Lose Information","Some languages, like Vietnamese, have pronouns to denote the relationship between the two speakers. When translating into English, all these pronouns are translated into ",[154,770,771],{},"I",[154,773,774],{},"you",", causing the loss of the relationship information.",[98,777,778,779,784],{},"Models can also have unexpected performance challenges in non-English languages. For example, ",[128,780,783],{"href":781,"rel":782},"https:\u002F\u002Fwww.newsguardtech.com\u002Fspecial-reports\u002Fchatgpt-generates-disinformation-chinese-vs-english\u002F",[132],"NewsGuard"," found that ChatGPT is more willing to produce misinformation in Chinese than in English.",[142,786,787,788],{},"In April 2023, NewsGuard asked ChatGPT-3.5 to produce misinformation articles about China in English, simplified Chinese, and traditional Chinese. For English, ChatGPT declined to produce false claims for six out of seven prompts. However, it produced false claims in simplified Chinese and traditional Chinese all seven times. 
It's unclear what causes this difference in behavior.",[506,789,790],{},[128,791,795],{"href":792,"ariaDescribedBy":793,"dataFootnoteRef":88,"id":794},"#user-content-fn-3",[512],"user-content-fnref-3","3",[93,797,799],{"id":798},"tokenization-latency-and-cost","Tokenization, Latency, and Cost",[98,801,802],{},"Other than quality issues, models can also be slower and more expensive for non-English languages. A model's inference latency and cost are proportional to the number of tokens in the input and response. It turns out that tokenization can be much more efficient for some languages than for others.",[98,804,805,806,811],{},"Benchmarking GPT-4 on MASSIVE, a dataset of one million short texts translated across 52 languages, Yennie Jun found that, to convey the same meaning, languages like Burmese and Hindi require ",[128,807,810],{"href":808,"rel":809},"https:\u002F\u002Fwww.artfish.ai\u002Fp\u002Fall-languages-are-not-created-tokenized",[132],"a lot more tokens"," than English or Spanish.",[102,813,814,821,828],{},[105,815,817,818,140],{"icon":816,"title":294},"i-lucide-gauge","For the MASSIVE dataset, the median token length in English is ",[664,819,820],{},"7",[105,822,824,825,140],{"icon":816,"title":823},"Hindi","The median token length in Hindi is ",[664,826,827],{},"32",[105,829,832,833,836],{"icon":830,"title":831},"i-lucide-gauge-circle","Burmese","The median token length in Burmese is ",[664,834,835],{},"72",", which is roughly ten times longer than in English.",[98,838,839],{},"Assuming that the time it takes to generate a token is the same in all languages, GPT-4 takes approximately ten times longer in Burmese than in English for the same content. For APIs that charge by token usage, Burmese costs ten times more than English.",[93,841,843],{"id":842},"language-specific-models","Language-Specific Models",[98,845,846,847,852,853,852,858,863],{},"To address this, many models have been trained to focus on non-English languages. 
The most active language, other than English, is undoubtedly Chinese, with ",[128,848,851],{"href":849,"rel":850},"https:\u002F\u002Fgithub.com\u002FTHUDM\u002FChatGLM2-6B",[132],"ChatGLM",", ",[128,854,857],{"href":855,"rel":856},"https:\u002F\u002Fgithub.com\u002Fwenge-research\u002FYAYI",[132],"YAYI",[128,859,862],{"href":860,"rel":861},"https:\u002F\u002Fgithub.com\u002FLlamaFamily\u002FLlama-Chinese",[132],"Llama-Chinese",", and others.",[98,865,866,867,872,873,878,879,884],{},"There are also models in French (",[128,868,871],{"href":869,"rel":870},"https:\u002F\u002Fhuggingface.co\u002Fcroissantllm",[132],"CroissantLLM","), Vietnamese (",[128,874,877],{"href":875,"rel":876},"https:\u002F\u002Fgithub.com\u002FVinAIResearch\u002FPhoGPT",[132],"PhoGPT","), Arabic (",[128,880,883],{"href":881,"rel":882},"https:\u002F\u002Fhuggingface.co\u002Fcollections\u002Finceptionai\u002Fjais-family",[132],"Jais","), and many more languages.",[93,886,888],{"id":887},"domain-specific-models","Domain-Specific Models",[98,890,891,892,852,897,902,903,908],{},"General-purpose models like ",[128,893,896],{"href":894,"rel":895},"https:\u002F\u002Fstorage.googleapis.com\u002Fdeepmind-media\u002Fgemini\u002Fgemini_1_report.pdf",[132],"Gemini",[128,898,901],{"href":899,"rel":900},"https:\u002F\u002Fopenai.com\u002Findex\u002Fhello-gpt-4o\u002F",[132],"GPTs",", and ",[128,904,907],{"href":905,"rel":906},"https:\u002F\u002Fwww.llama.com\u002F",[132],"Llamas"," can perform incredibly well on a wide range of domains, including but not limited to coding, law, science, business, sports, and environmental science. 
This is largely thanks to the inclusion of these domains in their training data.",[98,910,911,912,914,915],{},"Figure 2-3 shows the distribution of domains present in C4, a cleaned subset of Common Crawl, according to the ",[154,913,156],{},"'s 2023 analysis.",[506,916,917],{},[128,918,922],{"href":919,"ariaDescribedBy":920,"dataFootnoteRef":88,"id":921},"#user-content-fn-4",[512],"user-content-fnref-4","4",[98,924,925],{},[722,926],{"alt":927,"src":928},"Figure 2-3. Distribution of domains in the C4 dataset. Reproduced from the statistics from the Washington Post.",".\u002Fmedia\u002Ffig-2-3.png",[242,930,931],{},[98,932,933],{},"Figure 2-3. Distribution of domains in the C4 dataset. Reproduced from the statistics from the Washington Post. One caveat of this analysis is that it only shows the categories that are included, not the categories that are missing.",[98,935,936,937,945],{},"As of this writing, there haven't been many analyses of domain distribution in vision data. This might be because images are harder to categorize than texts.",[506,938,939],{},[128,940,944],{"href":941,"ariaDescribedBy":942,"dataFootnoteRef":88,"id":943},"#user-content-fn-5",[512],"user-content-fnref-5","5"," However, you can infer a model's domains from its benchmark performance.",[98,947,948,949,954],{},"Table 2-3 shows how two models, CLIP and Open CLIP, ",[128,950,953],{"href":951,"rel":952},"https:\u002F\u002Flaion.ai\u002Fblog\u002Flaion-5b\u002F",[132],"perform on different benchmarks",". These benchmarks show how well these two models do on birds, flowers, cars, and a few more categories, but the world is so much bigger and more complex than these few categories.",[242,956,957],{},[98,958,959],{},"Table 2-3. 
Open CLIP and CLIP's performance on different image datasets.",[248,961,962,982],{},[251,963,964],{},[254,965,966,969,976],{},[257,967,968],{},"Dataset",[257,970,971,972,975],{},"CLIP ",[973,974],"br",{},"Accuracy of ViT-B\u002F32 (OpenAI)",[257,977,978,979,981],{},"Open CLIP ",[973,980],{},"Accuracy of ViT-B\u002F32 (Cade)",[272,983,984,995,1006,1017,1028,1039,1050,1061],{},[254,985,986,989,992],{},[277,987,988],{},"ImageNet",[277,990,991],{},"63.2",[277,993,994],{},"62.9",[254,996,997,1000,1003],{},[277,998,999],{},"ImageNet v2",[277,1001,1002],{},"-",[277,1004,1005],{},"62.6",[254,1007,1008,1011,1014],{},[277,1009,1010],{},"Birdsnap",[277,1012,1013],{},"37.8",[277,1015,1016],{},"46.0",[254,1018,1019,1022,1025],{},[277,1020,1021],{},"Country211",[277,1023,1024],{},"17.8",[277,1026,1027],{},"14.8",[254,1029,1030,1033,1036],{},[277,1031,1032],{},"Oxford 102 Category Flower",[277,1034,1035],{},"66.7",[277,1037,1038],{},"66.0",[254,1040,1041,1044,1047],{},[277,1042,1043],{},"German Traffic Sign Recognition Benchmark",[277,1045,1046],{},"32.2",[277,1048,1049],{},"42.0",[254,1051,1052,1055,1058],{},[277,1053,1054],{},"Stanford Cars",[277,1056,1057],{},"59.4",[277,1059,1060],{},"79.3",[254,1062,1063,1066,1069],{},[277,1064,1065],{},"UCF101",[277,1067,1068],{},"64.5",[277,1070,1071],{},"63.1",[93,1073,1075],{"id":1074},"when-general-purpose-data-is-not-enough","When General-Purpose Data Is Not Enough",[98,1077,1078],{},"Even though general-purpose foundation models can answer everyday questions about different domains, they are unlikely to perform well on domain-specific tasks, especially if they never saw these tasks during training.",[102,1080,1081,1086],{},[105,1082,1085],{"icon":1083,"title":1084},"i-lucide-dna","Drug Discovery","Drug discovery involves protein, DNA, and RNA data, which follow specific formats and are expensive to acquire. 
This data is unlikely to be found in publicly available internet data.",[105,1087,1090],{"icon":1088,"title":1089},"i-lucide-scan-search","Cancer Screening","Cancer screening typically involves X-ray and fMRI (functional magnetic resonance imaging) scans, which are hard to obtain due to privacy.",[98,1092,1093],{},"To train a model to perform well on these domain-specific tasks, you might need to curate very specific datasets.",[102,1095,1096,1110,1120],{},[105,1097,1100,1101,1106,1107,140],{"icon":1098,"title":1099},"i-lucide-folders","AlphaFold","One of the most famous domain-specific models is perhaps ",[128,1102,1105],{"href":1103,"rel":1104},"https:\u002F\u002Fdeepmind.google\u002Fscience\u002Falphafold\u002F",[132],"DeepMind's AlphaFold",", trained on the sequences and 3D structures of around ",[664,1108,1109],{},"100,000 known proteins",[105,1111,1114,1119],{"icon":1112,"title":1113},"i-lucide-flask-conical","BioNeMo",[128,1115,1118],{"href":1116,"rel":1117},"https:\u002F\u002Fblogs.nvidia.com\u002Fblog\u002Fbionemo-large-language-models-drug-discovery\u002F",[132],"NVIDIA's BioNeMo"," focuses on biomolecular data for drug discovery.",[105,1121,1124,1129],{"icon":1122,"title":1123},"i-lucide-stethoscope","Med-PaLM2",[128,1125,1128],{"href":1126,"rel":1127},"https:\u002F\u002Fcloud.google.com\u002Fblog\u002Ftopics\u002Fhealthcare-life-sciences\u002Fsharing-google-med-palm-2-medical-large-language-model",[132],"Google's Med-PaLM2"," combined the power of an LLM with medical data to answer medical queries with higher accuracy.",[185,1131,1132],{},"Domain-specific models are especially common for biomedicine, but other fields can benefit from domain-specific models too. 
It's possible that a model trained on architectural sketches can help architects much better than Stable Diffusion, or a model trained on factory plans can be optimized for manufacturing processes much better than a generic model like ChatGPT.",[98,1134,1135],{},"This section gave a high-level overview of how training data impacts a model's performance. Next, let's explore the impact of how a model is designed on its performance.",[1137,1138,1141,1146],"section",{"className":1139,"dataFootnotes":88},[1140],"footnotes",[93,1142,1145],{"className":1143,"id":512},[1144],"sr-only","Footnotes",[1147,1148,1149,1161,1181,1190,1203],"ol",{},[1150,1151,1153,1154],"li",{"id":1152},"user-content-fn-1","A world population of eight billion was used for this calculation. ",[128,1155,1160],{"href":1156,"ariaLabel":1157,"className":1158,"dataFootnoteBackref":88},"#user-content-fnref-1","Back to reference 1",[1159],"data-footnote-backref","↩",[1150,1162,1164,1169,1170,1175,1176],{"id":1163},"user-content-fn-2",[128,1165,1168],{"href":1166,"rel":1167},"https:\u002F\u002Fwww.artfish.ai\u002Fp\u002Fgpt4-project-euler-many-languages?hide_intro_popup=true",[132],"\"GPT-4 Can Solve Math Problems--but Not in All Languages\""," by Yennie Jun. You can verify the study using ",[128,1171,1174],{"href":1172,"rel":1173},"https:\u002F\u002Fplatform.openai.com\u002Ftokenizer",[132],"OpenAI's Tokenizer",". ",[128,1177,1160],{"href":1178,"ariaLabel":1179,"className":1180,"dataFootnoteBackref":88},"#user-content-fnref-2","Back to reference 2",[1159],[1150,1182,1184,1185],{"id":1183},"user-content-fn-3","It might be because of some biases in pre-training data or alignment data. Perhaps OpenAI just didn't include as much data in the Chinese language or China-centric narratives to train their models. 
",[128,1186,1160],{"href":1187,"ariaLabel":1188,"className":1189,"dataFootnoteBackref":88},"#user-content-fnref-3","Back to reference 3",[1159],[1150,1191,1193,1197,1198],{"id":1192},"user-content-fn-4",[128,1194,1196],{"href":150,"rel":1195},[132],"\"Inside the Secret List of Websites That Make AI like ChatGPT Sound Smart\"",", Washington Post, 2023. ",[128,1199,1160],{"href":1200,"ariaLabel":1201,"className":1202,"dataFootnoteBackref":88},"#user-content-fnref-4","Back to reference 4",[1159],[1150,1204,1206,1207],{"id":1205},"user-content-fn-5","For texts, you can use domain keywords as heuristics, but there are no obvious heuristics for images. Most analyses I could find about vision datasets are about image sizes, resolutions, or video lengths. ",[128,1208,1160],{"href":1209,"ariaLabel":1210,"className":1211,"dataFootnoteBackref":88},"#user-content-fnref-5","Back to reference 5",[1159],{"title":88,"searchDepth":1213,"depth":1213,"links":1214},2,[1215,1216,1217,1218,1219,1220,1221,1222,1223,1224,1225],{"id":95,"depth":1213,"text":96},{"id":122,"depth":1213,"text":123},{"id":179,"depth":1213,"text":180},{"id":227,"depth":1213,"text":228},{"id":688,"depth":1213,"text":689},{"id":752,"depth":1213,"text":753},{"id":798,"depth":1213,"text":799},{"id":842,"depth":1213,"text":843},{"id":887,"depth":1213,"text":888},{"id":1074,"depth":1213,"text":1075},{"id":512,"depth":1213,"text":1145},"How training data quality, language coverage, and domain coverage shape foundation model capability, cost, and reliability.","md",{},{"icon":58},{"title":55,"description":1226},"uFHtc1kZ_q_jFval9Trj3TTnWMUJDW9id6aiVe5GXpg",[1233,1235],{"title":47,"path":52,"stem":53,"description":1234,"icon":12,"children":-1},"A guide to how training data, architecture, size, post-training, and sampling shape foundation model behavior.",{"title":60,"path":61,"stem":62,"description":1236,"icon":63,"children":-1},"How architecture, attention, model size, scaling laws, and bottlenecks shape foundation 
model capability and usability.",1779363442105]