-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapplication.py
505 lines (430 loc) · 18.1 KB
/
application.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
"""Article AI"""
import gc
import json
from typing import List, Dict, Any
import os
import re
import time
import threading
from enum import Enum
import logging
from logging.handlers import RotatingFileHandler
import warnings
import psutil
import numpy as np
from flask import Flask, redirect, render_template, send_from_directory, request, Response
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
import spacy
from spacy.tokens import Doc
import torch
from keybert import KeyBERT
from werkzeug.middleware.proxy_fix import ProxyFix
if not Doc.has_extension("text_id"):
Doc.set_extension("text_id", default=None)
DEFAULT_LIMIT = "100/second"
if os.environ.get('FLASK_LIMIT_DEFAULT') is not None:
DEFAULT_LIMIT = os.environ.get('FLASK_LIMIT_DEFAULT')
PROCESS_LIMIT = "10/second"
if os.environ.get('FLASK_LIMIT_PROCESS') is not None:
PROCESS_LIMIT = os.environ.get('FLASK_LIMIT_PROCESS')
LIMITER_STORAGE = "memory://"
if os.environ.get('FLASK_LIMITER_STORAGE') is not None:
LIMITER_STORAGE = os.environ.get('FLASK_LIMITER_STORAGE')
def limiter_key():
"""Limiter key function"""
return get_remote_address() + request.path
application = Flask(__name__)
application.wsgi_app = ProxyFix(application.wsgi_app, x_for=1, x_host=1)
application.secret_key = os.environ.get('FLASK_SECRET_KEY', '123')
application.config['TEMPLATES_AUTO_RELOAD'] = 1
application.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
limiter = Limiter(key_func=limiter_key, app=application,
default_limits=[DEFAULT_LIMIT],
storage_uri=LIMITER_STORAGE)
if os.environ.get('FLASK_LOG_FILE_PATH') is not None:
logging.basicConfig(
force=True,
handlers=[
RotatingFileHandler(
filename=os.environ.get('FLASK_LOG_FILE_PATH'),
maxBytes=1024 * 1024 * 10,
backupCount=10,
),
],
format='%(asctime)s %(levelname)s %(name)s %(threadName)s : %(message)s',
level=logging.INFO)
else:
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)s %(name)s %(threadName)s : %(message)s')
logging.getLogger("werkzeug").setLevel(logging.WARN)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.info("Configured flask limiter with default limit %s, process limit %s, storage %s",
DEFAULT_LIMIT, PROCESS_LIMIT, LIMITER_STORAGE)
logger.info("SpaCy version %s", spacy.about.__version__)
loading = threading.Semaphore()
@application.route("/")
def root():
"""Root page"""
host = request.headers['Host']
logger.info("Host: %s remote %s", host, request.remote_addr)
if re.match(r'^www\.', host):
return redirect("http://" + re.sub(r'^www\.', "", host), 301)
content = render_template("root.html")
webpage = render_template("index.html", content=content)
resp = Response(webpage)
resp.headers['Strict-Transport-Security'] = 'max-age=63072000' # 2 years
return resp
@application.route("/favicon.ico")
def favicon():
"""Redirect to proper favicon"""
return redirect("static/icon.svg", code=302)
@application.route('/robots.txt')
@application.route('/site.webmanifest')
def static_from_root():
"""Mapping for static files"""
return send_from_directory(application.static_folder, request.path[1:])
@application.route("/index.html")
def index():
"""Old root page"""
return redirect(".", 302)
class ModelName(str, Enum):
"""
Enum of the available models. This allows the API to raise a more specific
error if an invalid model is provided.
"""
# pylint: disable=fixme, invalid-name
en_core_web_sm = "en_core_web_sm"
en_core_web_md = "en_core_web_md"
en_core_web_lg = "en_core_web_lg"
en_core_web_trf = "en_core_web_trf"
xx_ent_wiki_sm = "xx_ent_wiki_sm"
fi_core_news_sm = "fi_core_news_sm"
fi_core_news_lg = "fi_core_news_lg"
sv_core_news_sm = "sv_core_news_sm"
sv_core_news_lg = "sv_core_news_lg"
MODELS = {}
KEYWORD_MODELS = {}
KEYWORDS_DEFAULT = 20
KEYWORD_MODEL_DEFAULT = "all-MiniLM-L6-v2"
# if CUDA_VISIBLE_DEVICES defined and not empty, use GPU
CUDA_VISIBLE_DEVICES = os.environ.get('CUDA_VISIBLE_DEVICES')
if CUDA_VISIBLE_DEVICES is not None and CUDA_VISIBLE_DEVICES != "":
logger.info("Using GPU")
spacy.require_gpu()
else:
logger.info("Using CPU")
# Use CPU and after that initialize spacytextblob
os.environ["CUDA_VISIBLE_DEVICES"] = ""
spacy.require_cpu()
# https://github.com/explosion/spaCy/discussions/10621
torch.set_num_threads(1) # nopep8, pylint: disable=no-member
# You gotta be kidding me https://github.com/ultralytics/yolov5/issues/6692
warnings.filterwarnings(
'ignore', message='User provided device_type of \'cuda\', but CUDA is not available. Disabling')
from spacytextblob.spacytextblob import SpacyTextBlob # nopep8, pylint: disable=unused-import,wrong-import-position
@application.route("/models")
def get_models() -> List[str]:
"""Return a list of all available loaded models."""
return json.dumps([model.value for model in ModelName])
def todict(obj, classkey=None, level=0):
"""
Converts object to dictionary so it can be written as JSON
"""
if level > 10:
logger.warning("todict recursion limit reached")
return None
if isinstance(obj, dict):
data = {}
for (key, value) in obj.items():
logger.info("Level items %s", key)
data[key] = todict(value, classkey, level + 1)
return data
elif hasattr(obj, "_ast"):
return todict(obj._ast(), None, level + 1) # pylint: disable=protected-access
elif hasattr(obj, "__iter__"):
return [todict(v, classkey, level + 1) for v in obj]
elif hasattr(obj, "__dict__"):
data = dict([(key, todict(value, classkey, level + 1))
for key, value in obj.__dict__.iteritems()
if not callable(value) and not key.startswith('_') and key not in ['name']])
if classkey is not None and hasattr(obj, "__class__"):
data[classkey] = obj.__class__.__name__
return data
elif isinstance(obj, (np.ndarray,)):
return obj.tolist()
elif isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
np.int16, np.int32, np.int64, np.uint8,
np.uint16, np.uint32, np.uint64)):
return int(obj)
elif isinstance(obj, (np.float_, np.float16, np.float32,
np.float64)):
return float(obj)
elif hasattr(obj, "__unicode__"):
return {'type': type(obj).__name__, 'text': obj.__unicode__()}
logger.info("Unknown %s", type(obj))
return {'type': type(obj).__name__}
def get_data(doc: Doc) -> Dict[str, Any]:
"""
Extract the data to return from the REST API given a Doc object. Modify
this function to include other data.
See more here: https://spacy.io/usage/linguistic-features
"""
entities = []
for ent in doc.ents:
entities.append(
{
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char
})
return {
"text": doc.text,
"entities": entities,
"text_id": doc._.text_id
}
def get_keywords(doc, keyword_model, top_n=KEYWORDS_DEFAULT, lang: str = "en"):
"""
Keyword extraction using KeyBERT with SpaCy embeddings
https://github.com/MaartenGr/KeyBERT
https://maartengr.github.io/KeyBERT/guides/embeddings.html#hugging-face-transformers
https://www.sbert.net/docs/pretrained_models.html
https://github.com/MartinoMensio/spacy-sentence-bert
"""
if keyword_model is None:
keyword_model = KEYWORD_MODEL_DEFAULT
if keyword_model not in KEYWORD_MODELS:
loading.acquire()
start = time.time_ns()
logger.info("Loading keyword model %s", keyword_model)
KEYWORD_MODELS[keyword_model] = KeyBERT(keyword_model)
end = time.time_ns()
logger.info("Loaded keyword model %s in %s ms",
keyword_model, (end - start) / 1000000)
loading.release()
kw_model = KEYWORD_MODELS.get(keyword_model)
# Generates [[keyword, score]]
keywords_scored = kw_model.extract_keywords(doc.text,
keyphrase_ngram_range=(1, 1),
stop_words=None,
top_n=top_n)
keywords = []
for keyword, score in keywords_scored: # pylint: disable=unused-variable
keywords.append(keyword)
return keywords
class PythonObjectEncoder(json.JSONEncoder):
"""
JSON encoder for Python objects
"""
def default(self, o):
if isinstance(o, (list, dict, str, int, float, bool, type(None))):
return json.JSONEncoder.default(self, o)
return todict(o)
def load_model(model: str, language="en"):
"""
Load a model or return it if it's already loaded.
"""
loading.acquire()
# Check model exists in ModelName class
if model not in ModelName.__members__:
raise ValueError(f"Unknown model: {model}")
if language is None:
language = "en"
if MODELS.get(model) is None:
start = time.time_ns()
logger.info("Loading model %s", model)
nlp = spacy.load(model)
nlp.add_pipe("spacytextblob")
MODELS[model] = nlp
end = time.time_ns()
logger.info("Loaded model in %i ms", (end - start) / 1000000)
loading.release()
return MODELS[model]
def process_texts(texts, model, keyword_model=KEYWORD_MODEL_DEFAULT,
top_n=KEYWORDS_DEFAULT, lang="en", as_tuples=False):
"""
Process a batch of articles and return the entities predicted by the
given model.
"""
nlp: spacy.language.Language = load_model(model, lang)
response = []
docs = []
previous = None
start = time.time_ns()
ids = []
disable = []
# spacytextblob suppots only English
if lang != "en":
disable.append("spacytextblob")
if as_tuples:
for doc, context in nlp.pipe(texts, as_tuples=True, disable=disable):
doc._.text_id = context["text_id"]
ids.append(context["text_id"])
docs.append(doc)
else:
for doc in nlp.pipe(texts, as_tuples=False, disable=disable):
docs.append(doc)
end = time.time_ns()
logger.info("Done analytics for %s in %s ms", ids, (end - start) / 1000000)
for doc in docs:
doc_data = get_data(doc)
# Runtime information
doc_data["model"] = model
doc_data["language"] = lang
doc_data["spacy_version"] = spacy.__version__
doc_data["spacy_extensions"] = [
s for s in list(nlp.pipe_names) if s not in disable]
# Similarity to previous text
if previous is not None:
doc_data["similarity"] = doc.similarity(previous)
if doc.has_extension("blob") and doc._.blob is not None:
doc_data["polarity"] = doc._.blob.polarity
doc_data["subjectivity"] = doc._.blob.subjectivity
doc_data["assessments"] = doc._.blob.sentiment_assessments.assessments
doc_data["ngrams"] = doc._.blob.ngrams()
doc_data["word_counts"] = doc._.blob.word_counts
start = time.time_ns()
doc_data["keywords"] = get_keywords(doc, keyword_model, top_n, lang)
doc_data["spacy_extensions"].append("keybert")
end = time.time_ns()
logger.info("Done keyword extraction for %s in %s ms",
doc._.text_id, (end - start) / 1000000)
response.append(doc_data)
previous = doc
return response
@application.route("/test")
def test():
"""
Internally tests API on predefined text
"""
texts = [
"""
Supply chain disruptions — triggered by factors including demand surges,
high transportation costs and pandemic-related lockdowns — are expected to
continue well into next year, experts predict. Companies are experiencing
the brunt of the impact, with 36% of small businesses responding to a 2021
U.S. Census survey reporting that they’ve experienced delays with domestic
suppliers. This has been costly. According to a 2020 Statista survey, 41% of
executives in the automotive and transportation industry alone said their
company lost $50 to $100 million due to supply chain issues, a figure which
has likely climbed higher since.
There’s no easy fix, but an emerging cohort of startups is pitching software
as a way to potentially anticipate — and respond to — market shocks.
One, Tive, provides supply chain visibility insights that ostensibly help
companies manage their in-transit shipments’ location and condition.
Tive today announced that it raised $54 million in a Series B financing
round led by AXA Venture Partners with participation from Sorenson Capital,
Qualcomm Ventures, Fifth Wall, SJF Ventures and Floating Point Ventures,
which CEO Krenar Komoni's article attributes to the company’s growth over
the past year.
""",
"""
April 11, 2022 –Tive, the technology leader in the new era of supply chain
and logistics visibility, today announced the closing of a $54M Series B
funding led by AXA Venture Partners, with participation from Sorenson Capital,
Qualcomm Ventures, Fifth Wall, SJF Ventures and Floating Point Ventures as well
as the existing investors RRE Ventures, Two Sigma Ventures, NextView Ventures,
Hyperplane Ventures, Broom Ventures, and Supply Chain Ventures.
In 2021, Tive grew its revenue by over 300%, acquired more than 200 new
customers and expanded its global footprint. This latest investment will fuel
Tive's rapidly growing international presence, with the expansion of global
sales and marketing initiatives. In addition, it will accelerate the development
and introduction of next-generation solutions, services and bring actionable
supply chain intelligence and 24/7 monitoring to the market.
# 1 in Supply Chain and Logistics Condition and Location Monitoring
Tive continues to outpace and out-innovate the competition with the
most advanced multi-sensor trackers, a truly intuitive SaaS application,
and live 24/7 shipment monitoring service. As the leading provider
of supply chain tracking technology, Tive has delivered real-time
shipment visibility in more than 200 countries, and helped save thousands
of shipments from being delayed, damaged, spoiled, or rejected.
"""
]
response_body = process_texts(texts, "en_core_web_md")
resp = Response(json.dumps(response_body, cls=PythonObjectEncoder))
resp.headers['Content-Type'] = 'application/json'
gc.collect()
return resp
@application.errorhandler(429)
def ratelimit_handler(err):
"""
Log 429 and send JSON response
"""
logger.warning("Rate limit exceeded: %s, remote addr %s",
err.description, get_remote_address())
resp = Response(json.dumps({
'error': 'ratelimit exceeded',
'description': err.description
}, cls=PythonObjectEncoder))
resp.status_code = 429
resp.headers['Content-Type'] = 'application/json'
resp.headers['Cache-Control'] = 'no-cache, no-store, max-age=0'
resp.headers['Access-Control-Max-Age'] = '86400'
resp.headers['Access-Control-Allow-Origin'] = '*'
return resp
@application.after_request
def add_headers(response):
"""
Add limiter headers to response
"""
if limiter.current_limit:
response.headers["RemainingLimit"] = limiter.current_limit.remaining
response.headers["ResetAt"] = limiter.current_limit.reset_at
response.headers["MaxRequests"] = limiter.current_limit.limit.amount
response.headers["WindowSize"] = limiter.current_limit.limit.get_expiry()
response.headers["Breached"] = limiter.current_limit.breached
return response
@application.route("/process", methods=('POST',))
@limiter.limit(PROCESS_LIMIT)
def process():
"""
Process one or more texts, generating article analytics.
"""
start = time.time_ns()
params = request.get_json()
model = params.get("model") or "en_core_web_md"
keyword_model = params.get("keyword_model")
keywords = params.get("keywords") or KEYWORDS_DEFAULT
texts = params.get("texts") or []
ids = []
lang = params.get("language") or "en"
if keyword_model is None and lang != "en":
keyword_model = "paraphrase-multilingual-MiniLM-L12-v2"
if keyword_model is None:
keyword_model = KEYWORD_MODEL_DEFAULT
# Convert to tuples if necessary
as_tuples = False
if isinstance(texts[0], dict):
as_tuples = True
ids = [text["text_id"] for text in texts]
texts = [(text["text"], {"text_id": text["text_id"]})
for text in texts]
else:
ids = [str(i) for i in range(len(texts))]
logger.info("Processing texts (%s) with model %s limit remaining %s max %s",
ids, model,
limiter.current_limit.remaining,
limiter.current_limit.limit.amount)
try:
response_body = process_texts(
texts, model, keyword_model, keywords, lang, as_tuples)
except Exception as err: # pylint: disable=broad-except
logger.error("Error processing %s: %s", ids, err, exc_info=True)
response_body = {"error": str(err)}
resp = Response(json.dumps(response_body, cls=PythonObjectEncoder))
resp.headers['Content-Type'] = 'application/json'
resp.headers['Cache-Control'] = 'no-cache, no-store, max-age=0'
resp.headers['Access-Control-Max-Age'] = '86400'
resp.headers['Access-Control-Allow-Origin'] = '*'
end = time.time_ns()
logger.info("Processed %s in %i ms, memory usage: %s", ids, (end -
start) / 1000000, f"{psutil.virtual_memory()}")
return resp
if __name__ == "__main__":
# Setting debug to True enables debug output. This line should be
# removed before deploying a production app.
application.debug = True
application.run()