from fastai.basics import *
from fastai.callback.all import *
from fastai.text.all import *
Data
We’ll start by using untar_data() to download and uncompress our data:
path = untar_data(URLs.WIKITEXT_TINY)
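If you want to see what we got, path.ls() lists the downloaded files (just a quick sanity check; the two CSV files it shows are the ones we read next):
path.ls()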
The dataset comes with two CSV files (train.csv and test.csv); we’ll read them and combine them into one dataframe:
df_train = pd.read_csv(path/'train.csv', header=None)
df_valid = pd.read_csv(path/'test.csv', header=None)
df_all = pd.concat([df_train, df_valid])
df_all.head()
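Each row is one article stored as raw text; since the files have no header, the text lives in column 0 (which is why we pass 0 to Tokenizer.from_df below). As an optional peek at the start of the first article:
df_all[0].iloc[0][:150]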
The text is already space-separated, so we could tokenize it on spaces, but here we’ll use the standard fastai tokenizer:
# Train/validation split: the first len(df_train) rows train, the rest validate
splits = [list(range_of(df_train)), list(range(len(df_train), len(df_all)))]
# Pipeline: grab the text, tokenize it with the fastai tokenizer (column 0), then map tokens to ids
tfms = [attrgetter("text"), Tokenizer.from_df(0), Numericalize()]
# dl_type=LMDataLoader concatenates all the texts and builds language-model batches
dsets = Datasets(df_all, [tfms], splits=splits, dl_type=LMDataLoader)
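As a quick, optional sanity check of the pipeline, we can grab one processed item (a 1-tuple holding a tensor of token ids) and decode it back to readable tokens:
t = dsets.train[0]            # (TensorText of token ids,)
t[0][:10], dsets.decode(t)    # first ten ids, and the text they encode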
bs,sl = 104,72  # batch size and sequence length
dls = dsets.dataloaders(bs=bs, seq_len=sl)
dls.show_batch(max_n=3)
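For a language model, the targets are simply the inputs shifted one token ahead, which LMDataLoader handles for us; we can verify this on one batch (a minimal check):
xb,yb = dls.one_batch()
xb.shape, yb.shape                 # both are (bs, sl) = (104, 72)
(xb[0,1:] == yb[0,:-1]).all()      # targets = inputs shifted by one token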