"""Build a document-frequency vocabulary for the newsgroups training set.

Steps performed, in order:

1. Collect every token listed in the GloVe 6B 300d embedding file.
2. Tokenize each training document with spaCy's English tokenizer and record,
   for every lower-cased token, the set of documents it occurs in.
3. Group tokens by document frequency (number of distinct documents).
4. Walk the groups from most to least frequent, shuffle tokens that share a
   frequency, and keep only those tokens that GloVe knows about.
5. Write the kept tokens whose document frequency is at least
   ``MIN_DOCUMENT_FREQUENCY`` to ``vocabulary.dat`` as
   ``<document frequency>\\t<token>`` lines.

The GloVe archive can be fetched with:

    wget http://nlp.stanford.edu/data/glove.6B.zip
"""

import re
from collections import defaultdict
from zipfile import ZipFile

import numpy as np
import spacy
from spacy.lang.en import English

GLOVE_ARCHIVE = "glove.6B.zip"
GLOVE_MEMBER = "glove.6B.300d.txt"
NEWSGROUPS_ARCHIVE = "ml530-2022-fall-newsgroups.zip"
NUM_TRAINING_DOCUMENTS = 15062
MIN_DOCUMENT_FREQUENCY = 10
OUTPUT_PATH = "vocabulary.dat"

# collect the tokens that have a GloVe embedding
vocabulary = set()
with ZipFile(GLOVE_ARCHIVE, "r") as archive:
    with archive.open(GLOVE_MEMBER, "r") as file:
        for line in file:
            # Only the first space-separated field (the token) is needed, so
            # stop splitting after it instead of splitting all 300 floats.
            vocabulary.add(line.decode("utf-8").split(" ", 1)[0])

nlp = English()
tokenizer = nlp.tokenizer

# create token-to-file mapping: lower-cased token -> set of document indices
mapping = defaultdict(set)
with ZipFile(NEWSGROUPS_ARCHIVE, "r") as archive:
    for i in range(NUM_TRAINING_DOCUMENTS):
        member = "newsgroups_trn/newsgroups_trn_" + str(i).zfill(5) + ".txt"
        with archive.open(member) as file:
            text = file.read().decode("utf-8")
        # str.split() without arguments already splits on tabs, carriage
        # returns and newlines, so this collapses every whitespace run to a
        # single space without a separate substitution pass.
        text = " ".join(text.split())
        for token in tokenizer(text):
            # set.add is idempotent, so no membership pre-check is needed
            mapping[token.text.lower()].add(i)

# create frequency-to-token mapping: document frequency -> list of tokens
members = defaultdict(list)
for token, documents in mapping.items():
    # Dictionary keys are unique, so each token is appended exactly once and
    # no (linear-time) duplicate check against the list is required.
    members[len(documents)].append(token)

# sort tokens by frequency, in descending order
tuples = sorted(members.items(), reverse=True)

# select tokens for the vocabulary
selected = []
for _, candidates in tuples:
    # Tokens sharing a document frequency are put in random order; the
    # generator is not seeded here, so the order differs between runs.
    np.random.shuffle(candidates)
    selected.extend(
        candidate for candidate in candidates if candidate in vocabulary
    )

# write out vocabulary
# The context manager closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII tokens writable regardless of the system locale.
with open(OUTPUT_PATH, "w", encoding="utf-8") as output:
    for token in selected:
        n = len(mapping[token])
        if n >= MIN_DOCUMENT_FREQUENCY:
            output.write(str(n) + "\t" + token + "\n")