add indexed files

parent cbbb8148b5
commit bf1fa8e3dd

7 changed files with 250 additions and 190 deletions
88 tokenizer.py

@@ -1,53 +1,49 @@
-#coding:utf-8
-import piexel
+# coding:utf-8
 import re
 
 
 class Tokenizer:
     def __init__(self, conf, api):
         self.conf = conf
         self.reload_tokens(api)
 
     def reload_tokens(self, api):
         """
         Load the tokens from the API
         and initialise the list of steps.
         """
         self.tk = api.get_tokens()
         self.steps = list(set(t['step'] for t in self.tk))
         self.steps.sort()
 
     def get_tokens_step(self, step):
         """
         Return all the tokens of step <step>.
         """
         return [t for t in self.tk if t['step'] == step]
 
     def tokenize(self, filename):
         """
         Scan <filename> for all of its markers.
         The markers are removed and sorted into lists.
         Returns the name stripped of its markers, along with the dictionary of markers.
         """
-        found = {'lang':[], 'quality':[], 'subtitle':[]}
+        found = {'lang': [], 'quality': [], 'subtitle': []}
         for step in self.steps:
             for tok in self.get_tokens_step(step):
-                if(not bool(int(tok['case_sensitive']))):
-                    reg = re.compile(r' '+tok['token']+r' ', re.IGNORECASE)
+                if (not bool(int(tok['case_sensitive']))):
+                    reg = re.compile(r' ' + tok['token'] + r' ', re.IGNORECASE)
                 else:
-                    reg = re.compile(r' '+tok['token']+r' ')
+                    reg = re.compile(r' ' + tok['token'] + r' ')
                 if reg.search(filename):
                     for tok_lang in tok['languages']:
                         found['lang'].append(tok_lang['value'])
                     for tok_qual in tok['qualities']:
                         found['quality'].append(tok_qual['value'])
                     for tok_sub in tok['subtitle_languages']:
                         found['subtitle'].append(tok_sub['value'])
                     filename = reg.sub(' ', filename)
         for typ in found:
             found[typ] = [e for e in found[typ] if e != 'N/A']
         return filename, found
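For reference, a minimal sketch of how the class in this diff can be exercised. The FakeApi class and the sample token values below are hypothetical illustrations, not part of the commit; only the dict keys mirror what Tokenizer.tokenize() actually reads.

    # Usage sketch, assuming tokenizer.py as committed above.
    # FakeApi and the sample token data are hypothetical, not from the repository.
    from tokenizer import Tokenizer


    class FakeApi:
        def get_tokens(self):
            # One token dict carrying the keys Tokenizer.tokenize() reads.
            return [{
                'step': 1,
                'token': 'VOSTFR',
                'case_sensitive': '0',            # '0' -> case-insensitive match
                'languages': [{'value': 'ja'}],
                'qualities': [{'value': 'N/A'}],  # 'N/A' entries are filtered out
                'subtitle_languages': [{'value': 'fr'}],
            }]


    tok = Tokenizer({}, FakeApi())
    # Tokens are matched as ' <token> ', so the filename needs surrounding spaces.
    name, found = tok.tokenize(' My Show VOSTFR 01 ')
    print(name)   # ' My Show 01 '
    print(found)  # {'lang': ['ja'], 'quality': [], 'subtitle': ['fr']}

Note that because each pattern is built as r' ' + token + r' ', a marker at the very start or end of an unpadded filename will not match; the caller is expected to pad the name with spaces first.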