Revise documentation

caiyishu · Nov 4, 2021 · 439711e · 439711e
1 parent f02ac9b
commit 439711e
Show file tree

Hide file tree

Showing 8 changed files with 27 additions and 10 deletions.
diff --git a/docs/api/restful_java.md b/docs/api/restful_java.md
@@ -6,15 +6,15 @@ Add the following dependency into the `pom.xml` file of your project.
 <dependency>
   <groupId>com.hankcs.hanlp.restful</groupId>
   <artifactId>hanlp-restful</artifactId>
-  <version>0.0.4</version>
+  <version>0.0.7</version>
 </dependency>
 ```
 
 Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initiate a `HanLPClient` and call its `parse` interface.
 
 ```java
 HanLPClient client = new HanLPClient("https://hanlp.hankcs.com/api", null); // Replace null with your auth
-System.out.println(client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。英首相与特朗普通电话讨论华为与苹果公司。"));
+System.out.println(client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。"));
 ```
 
 Refer to our [testcases](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java) and [data format](../data_format) for more details.

diff --git a/docs/conf.py b/docs/conf.py
@@ -1,5 +1,4 @@
 # -- Project information -----------------------------------------------------
-import os
 import sys
 import os
 
@@ -74,7 +73,7 @@
 # a list of builtin themes.
 #
 html_theme = "sphinx_book_theme"
-html_title = "HanLP documentation"
+html_title = "HanLP Documentation"
 html_logo = "_static/logo.png"
 html_favicon = "_static/favicon.png"
 html_copy_source = True

diff --git a/docs/data_format.md b/docs/data_format.md
@@ -78,7 +78,7 @@ For example, the following RESTful codes will output such an instance.
 :tags: [output_scroll]
 from hanlp_restful import HanLPClient
 HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None)  # Fill in your auth
-print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。英首相与特朗普通电话讨论华为与苹果公司。'))
+print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。'))
 ```
 
 The output above is represented as a `json` dictionary where each key is a task name and its value is 

diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -85,7 +85,7 @@ which offers visualization in any mono-width text environment.
 
 ````{margin} **Non-ASCII**
 ```{note}
-Non-ASCII text might screw in which case copying it into a `.tsv` editor will align it correctly. 
+Non-ASCII text might be misaligned in terminals, but in Jupyter Notebook it will align correctly. 
 You can also use our [live demo](https://hanlp.hankcs.com/).
 ```
 ````

diff --git a/hanlp/components/mtl/tasks/pos.py b/hanlp/components/mtl/tasks/pos.py
@@ -76,7 +76,7 @@ def __init__(self,
                  **kwargs) -> None:
         """A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
         any tagging tasks including PoS tagging and many others. It also features a custom dictionary ``dict_tags``
-        to perform ``longest-prefix-matching`` and replaced matched tokens with given tags.
+        to perform ``longest-prefix-matching`` which replaces matched tokens with given tags.
 
 
         .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can

diff --git a/plugins/hanlp_restful/hanlp_restful/__init__.py b/plugins/hanlp_restful/hanlp_restful/__init__.py
@@ -242,14 +242,15 @@ def coreference_resolution(self, text: Optional[str] = None, tokens: Optional[Li
                                          'language': language or self._language})
         return response
 
-    def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None) -> List[List[str]]:
+    def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None, language=None) -> List[List[str]]:
         """ Split a document into sentences and tokenize them. Note that it is always faster to tokenize a whole
         document than to tokenize each sentence one by one. So avoid calling this method sentence by sentence but put
         sentences into a ``list`` and pass them to the ``text`` argument.
 
         Args:
             text: A document (``str``), or a list of sentences (``List[str]``).
             coarse: Whether to perform coarse-grained or fine-grained tokenization.
+            language: The language of input text. ``None`` to use the default language.
 
         Examples::
 
@@ -274,8 +275,21 @@ def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None) -
             [['商品', '和', '服务', '。'],
              ['当', '下雨天', '地面', '积水', '分', '外', '严重']]
 
+            # Multilingual tokenization by specifying language='mul':
+            HanLP.tokenize(
+                ['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques '
+                 'to production environment.',
+                 '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
+                 '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'], language='mul')
+            [['In', '2021', ',', 'HanLPv2.1', 'delivers', 'state-of-the-art', 'multilingual',
+              'NLP', 'techniques', 'to', 'production', 'environment', '.'],
+             ['2021', '年', '、', 'HanLPv2.1', 'は', '次', '世代', 'の', '最', '先端', '多',
+              '言語', 'NLP', '技術', 'を', '本番', '環境', 'に', '導入', 'します', '。'],
+             ['2021', '年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次世代', '最', '先进的',
+              '多', '语种', 'NLP', '技术', '。']]
+
         Returns:
             A list of tokenized sentences.
         """
-        doc = self.parse(text=text, tasks='tok/coarse' if coarse is True else 'tok')
+        doc = self.parse(text=text, tasks='tok/coarse' if coarse is True else 'tok', language=language)
         return next(iter(doc.values()))
diff --git a/plugins/hanlp_restful/setup.py b/plugins/hanlp_restful/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='hanlp_restful',
-    version='0.0.8',
+    version='0.0.9',
     description='HanLP: Han Language Processing',
     long_description=long_description,
     long_description_content_type="text/markdown",

diff --git a/plugins/hanlp_restful/tests/test_client.py b/plugins/hanlp_restful/tests/test_client.py
@@ -34,6 +34,10 @@ def test_tokenize(self):
         print(self.HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司'))
         print(self.HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True))
         print(self.HanLP.tokenize(['商品和服务。', '当下雨天地面积水分外严重']))
+        print(self.HanLP.tokenize(
+            ['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
+             '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
+             '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'], language='mul'))
 
     def test_coreference_resolution(self):
         print(self.HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。'))