In [1]:
import io

def load_vectors(file_name, size, exceptions=None, language='en'):
    """Read at most `size` word vectors from a text `.vec` file.

    The file starts with a header line holding two integers; each following
    line is `word v1 v2 ...` separated by single spaces.

    A word is kept when `good_word(word, language)` accepts it and its
    lower-cased form is neither listed in `exceptions` nor already kept
    (the same word may occur with different capitalisation; the first
    occurrence wins).

    Args:
        file_name: path of the UTF-8 encoded vector file.
        size: maximum number of words to return; `size <= 0` yields `{}`.
        exceptions: optional iterable of lower-cased words to skip.  It is
            only read, never modified, so neither a caller's list nor a
            shared default can carry words over from one call to the next.
        language: language code forwarded to `good_word`.

    Returns:
        dict mapping each lower-cased word to its vector as a list of floats,
        in file order.

    Raises:
        ValueError: if the header is not exactly two integers or a vector
            component is not a valid float.
    """
    # A private set gives O(1) membership tests and keeps the argument intact.
    seen = set(exceptions) if exceptions else set()
    data = {}
    with io.open(file_name, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed, but unpacking still validates it.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            if len(data) >= size:
                break
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            key = word.lower()
            if key in seen or not good_word(word, language):
                continue
            # Materialise the vector so it can be iterated more than once.
            data[key] = [float(token) for token in tokens[1:]]
            seen.add(key)
    return data
def good_word(string, language='en'):
    """Return True when `string` looks like an ordinary word of `language`.

    `str.isalpha` is not usable for Hindi, Sinhala, Chinese and Japanese, so
    those languages are checked against Unicode block ranges instead:

    * 'hi': every character in U+0900..U+097F, see
      https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
    * 'si': every character in U+0D80..U+0DFF, see
      https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)
    * 'zh': every character in U+4E00..U+9FFF, see
      https://en.wikipedia.org/wiki/CJK_Unified_Ideographs#CJK_Unified_Ideographs
    * 'ja': every character in U+3040..U+30FF (the range the original check
      used for Hiragana and Katakana) or U+4E00..U+9FFF, and not the
      punctuation character '・'.
    * 'de': alphabetic with everything after the first character lower-case
      (German nouns are capitalised, not just proper nouns).  A one-character
      string is rejected because its empty tail is not lower-case.
    * any other code: alphabetic and entirely lower-case.

    The empty string is never a word, whatever the language.

    Args:
        string: candidate word.
        language: language code selecting the rule above.

    Returns:
        bool
    """
    if not string:
        # Without this guard the range checks below would accept ''.
        return False
    if language == 'hi':
        # The zero-width joiner (U+200D) lies outside this range and is
        # therefore rejected without a separate comparison.
        return all(u'\u0900' <= character <= u'\u097f' for character in string)
    if language == 'si':
        # As for 'hi': the range check already excludes U+200D.
        return all(u'\u0d80' <= character <= u'\u0dff' for character in string)
    if language == 'zh':
        return all(u'\u4e00' <= character <= u'\u9fff' for character in string)
    if language == 'ja':
        return all(
            character != '・'  # punctuation
            and (u'\u3040' <= character <= u'\u30ff'
                 or u'\u4e00' <= character <= u'\u9fff')
            for character in string
        )
    if language == 'de':
        # OK if the first letter is capitalised.
        return string.isalpha() and string[1:].islower()
    return string.isalpha() and string.islower()
def write_js(input_name, output_name, language, size, exceptions=None):
    """Write the first `size` usable word vectors of a `.vec` file as JavaScript.

    The output is a UTF-16 file that makes sure the global object
    `words_to_features` exists and then assigns
    `words_to_features[language] = {"word":[v1,v2,...,], ...}`.

    Args:
        input_name: path of the vector file read by `load_vectors`.
        output_name: path of the JavaScript file to create or overwrite.
        language: language code; used for word filtering and as the key
            under which the table is stored.
        size: maximum number of words to export.
        exceptions: optional iterable of lower-cased words to leave out.
            A copy is handed to the loader, so neither the caller's list
            nor a shared default is modified.
    """
    # Fresh list per call: the loader may append to what it is given.
    data = load_vectors(input_name, size, list(exceptions) if exceptions else [], language)
    # `with` closes the file even if a write fails part-way through.
    with io.open(output_name, 'w', encoding='utf-16', newline='\n') as out:
        # `typeof` is safe when the variable was never declared, and the
        # assignment targets the same name that the next line indexes.
        print("if (typeof words_to_features === 'undefined') words_to_features = {};", file=out)
        print('words_to_features["' + language + '"] = {', file=out)
        for key, vector in data.items():
            # Every component is followed by a comma, including the last one
            # (a trailing comma is valid in a JavaScript array literal).
            components = ''.join(str(x) + ',' for x in vector)
            print('"' + key + '":[' + components + '],', file=out)
        print('}', file=out)

def write_vectors(input_name, output_name, language, size, exceptions=None):
    """Write the first `size` usable word vectors of a `.vec` file as a smaller `.vec` file.

    The output is UTF-8 text: a header line `<word count> <dimension>`
    followed by one `word v1 v2 ... ` line per word (each value, including
    the last, is followed by a single space).

    The header reports the number of words actually written and the length
    of the first vector, so it stays correct when fewer than `size` words
    qualify or the vectors are not 300-dimensional.  For a full
    20000-word, 300-dimensional export it is still `20000 300`.

    Args:
        input_name: path of the vector file read by `load_vectors`.
        output_name: path of the `.vec` file to create or overwrite.
        language: language code used for word filtering.
        size: maximum number of words to export.
        exceptions: optional iterable of lower-cased words to leave out.
            A copy is handed to the loader, so neither the caller's list
            nor a shared default is modified.
    """
    # Fresh list per call: the loader may append to what it is given.
    data = load_vectors(input_name, size, list(exceptions) if exceptions else [], language)
    # Materialise the vectors up front: they may be one-shot iterators and
    # the dimension is needed for the header before any row is written.
    rows = [(key, list(vector)) for key, vector in data.items()]
    dimension = len(rows[0][1]) if rows else 0
    # `with` closes the file even if a write fails part-way through.
    with io.open(output_name, 'w', encoding='utf-8', newline='\n') as out:
        print(str(len(rows)) + ' ' + str(dimension), file=out)
        for key, vector in rows:
            print(key + ' ' + ''.join(str(x) + ' ' for x in vector), file=out)


In [61]:
# Regenerate every exported embedding file in one idempotent cell.
#
# This replaces the former one-call-per-cell copies, several of which were
# duplicates (sv, it and id were written twice).  Each output file is
# produced once, from the input used by the last cell that wrote it; in
# particular the 'id' .js file comes from cc.id.300.vec, not wiki.id.vec.

VECTOR_DIR = 'c:/temp'
OUTPUT_DIR = 'C:/Users/Ken/Documents/GitHub/ai/word-embeddings'
VOCABULARY_SIZE = 20000

# Languages exported as <OUTPUT_DIR>/<language>/wiki-words.js
# (no .js file was generated for 'en').
JS_LANGUAGES = ['id', 'hi', 'sv', 'fi', 'el', 'it', 'pt', 'si', 'fr', 'de',
                'zh', 'ja', 'es', 'lt']

# Languages exported as <OUTPUT_DIR>/<language>/wiki-words.vec
VEC_LANGUAGES = ['en', 'it', 'sv', 'ja', 'de', 'el', 'fr', 'fi', 'id', 'pt',
                 'si', 'zh', 'hi', 'es', 'lt']


def source_path(language):
    """Return the path of the input vector file for `language`."""
    if language == 'en':
        # English is the only language read from the wiki-news vectors.
        return VECTOR_DIR + '/wiki-news-300d-1M.vec'
    return VECTOR_DIR + '/cc.' + language + '.300.vec'


for language in JS_LANGUAGES:
    # A fresh list per call keeps words kept for one language from being
    # treated as exceptions for the next one.
    write_js(source_path(language),
             OUTPUT_DIR + '/' + language + '/wiki-words.js',
             language, VOCABULARY_SIZE, [])

for language in VEC_LANGUAGES:
    write_vectors(source_path(language),
                  OUTPUT_DIR + '/' + language + '/wiki-words.vec',
                  language, VOCABULARY_SIZE, [])

In [2]:
# Sanity check: a single CJK ideograph should be accepted for language 'zh'.
good_word('在', 'zh')

True