From 0a39a91b4613ce88d60d13290011d1701675f2f1 Mon Sep 17 00:00:00 2001 From: stet-stet Date: Thu, 27 Feb 2020 00:17:30 +0900 Subject: [PATCH] add correct mecab installation instructions --- README.md | 4 +++- install_external_tools.sh | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 803ba26d..9b421742 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ be found in [6], together with an extensive experimental evaluation. * [Faiss](https://github.com/facebookresearch/faiss), for fast similarity search and bitext mining * [transliterate 1.10.2](https://pypi.org/project/transliterate), only used for Greek (`pip install transliterate`) * [jieba 0.39](https://pypi.org/project/jieba/), Chinese segmenter (`pip install jieba`) -* [mecab 0.996](https://pypi.org/project/JapaneseTokenizer/), Japanese segmenter +* [mecab 0.996](https://pypi.org/project/JapaneseTokenizer/), segmenter only used for Japanese. * tokenization from the Moses encoder (installed automatically) * [FastBPE](https://github.com/glample/fastBPE), fast C++ implementation of byte-pair encoding (installed automatically) @@ -46,6 +46,8 @@ be found in [6], together with an extensive experimental evaluation. `export LASER="${HOME}/projects/laser"` * download encoders from Amazon s3 by `bash ./install_models.sh` * download third party software by `bash ./install_external_tools.sh` + * If you do not have `unzip` on your system, run `sudo apt install unzip` beforehand. + * If your task involves Japanese text, install mecab manually by following the instructions that `install_external_tools.sh` prints at the end of its run. 
* download the data used in the example tasks (see description for each task) ## Applications diff --git a/install_external_tools.sh b/install_external_tools.sh index b250982c..8b2a960d 100755 --- a/install_external_tools.sh +++ b/install_external_tools.sh @@ -166,6 +166,11 @@ InstallFastBPE echo "" echo "automatic installation of the Japanese tokenizer mecab may be tricky" echo "Please install it manually from https://github.com/taku910/mecab" -echo "" echo "The installation directory should be ${LASER}/tools-external/mecab" echo "" +echo "When configuring mecab prior to installation, please enable utf8 output by running the following." +echo "for mecab: " +echo "./configure --enable-utf8-only" +echo "for mecab-ipadic: " +echo "./configure --with-charset=utf8" +echo ""