2.3. C++でコマンドを作成する/Write a command in C++

C++で書かれたソースコードをコンパイルして、いつでも使えるコマンドとして、PATHで指定されたディレクトリに格納するための手順を紹介します。

Here we will show how to compile a C++ program and store it into a directory that is specified by the PATH environment variable, so that you can use it as a command at any time.

2.3.1. ソースコード/The source code

題材に使うのは、次のプログラムです。これは、ソースコードの文字コードをチェックするプログラムです。演習の中でソースを提出する前に、このプログラムでチェックしてください。長いのでタイプしてもらう必要はありません。

We will use the following program as our example. It checks the character encoding of source code. Use it to check your source files before you submit them during the seminar. The program is long, so you don’t need to type it in.

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>

/* check the encoding of one file */
bool checkOneFile(const char *filename);

/*
 * Program entry point: check the encoding of every file named on the
 * command line, in order.  Stops at the first file for which
 * checkOneFile() returns false and exits with EXIT_FAILURE; exits with
 * EXIT_SUCCESS when every call returned true (including when no file
 * names were given).
 */
int main(int argc, const char *argv[])
{
    /* argv[0] is the program name, so the file names start at index 1 */
    int idx = 1;
    while (idx < argc) {
        if (! checkOneFile(argv[idx])) {
            /* non-zero exit status tells the caller that something went wrong */
            return EXIT_FAILURE;
        }
        ++idx;
    }
    /* every file was processed */
    return EXIT_SUCCESS;
}

/*
 * States of the byte-by-byte state machine that scans a source file.
 * The numeric values are written out explicitly; they are the same
 * values the compiler would assign implicitly.
 */
enum State {
    S_NORMAL      = 0,  /* inside a line, after an ordinary character */
    S_BOL         = 1,  /* at the beginning of a line */
    S_CR          = 2,  /* a CR has just been read */
    S_UTF_EXPECT3 = 3,  /* 3 more UTF-8 trailer bytes must follow */
    S_UTF_EXPECT2 = 4,  /* 2 more trailer bytes must follow */
    S_UTF_EXPECT1 = 5   /* 1 more trailer byte must follow */
};

/*
 * Print one error line for an incident, but only for the first incident
 * of its kind: nothing is printed unless incident_count is exactly 1.
 *
 * incident_count  how many incidents of this kind have been seen so far
 * msg             description of the incident
 * filename        name of the file being checked
 * line_count      number of line ends seen so far; the printed line
 *                 number is line_count + 1
 */
void report(int incident_count, const char *msg, const char *filename, int line_count) {
    if (incident_count != 1) {
        /* already reported once for this file (or nothing to report) */
        return;
    }
    const int line_number = line_count + 1;
    std::cout << filename << "(" << line_number << ") [ERROR] :" << msg << std::endl;
}

/*
 * True when c is a UTF-8 trailer (continuation) byte, i.e. its bit
 * pattern is 10xx xxxx (0x80 through 0xbf).
 */
bool isUtf8Trailer(char c) {
    // Work on the unsigned value so that the shift below is not affected
    // by the sign of 'char'.
    const unsigned char byte = static_cast<unsigned char>(c);
    // Dropping the low 6 bits leaves the top two bits; a trailer has
    // binary 10 there, which is 2.
    return (byte >> 6) == 0x02;
}

/*
 * True when c has the bit pattern 110x xxxx (0xc0 through 0xdf): the
 * first byte of a sequence that is followed by 1 trailer byte.
 */
bool isUtf8Header1(char c) {
    const unsigned char byte = static_cast<unsigned char>(c);
    // the top three bits must be binary 110, which is 6
    return (byte >> 5) == 0x06;
}

/*
 * True when c has the bit pattern 1110 xxxx (0xe0 through 0xef): the
 * first byte of a sequence that is followed by 2 trailer bytes.
 */
bool isUtf8Header2(char c) {
    const unsigned char byte = static_cast<unsigned char>(c);
    // the top four bits must be binary 1110, which is 0xe
    return (byte >> 4) == 0x0e;
}

/*
 * True when c has the bit pattern 1111 0xxx (0xf0 through 0xf7): the
 * first byte of a sequence that is followed by 3 trailer bytes.
 */
bool isUtf8Header3(char c) {
    const unsigned char byte = static_cast<unsigned char>(c);
    // the top five bits must be binary 11110, which is 0x1e
    return (byte >> 3) == 0x1e;
}

/*
 * Check the bytes of one file against the coding convention and print
 * the first incident of each kind through report().
 *
 * Incidents that are reported:
 *   - tab characters
 *   - CR-LF and lone CR line ends
 *   - other control characters below 0x20
 *   - byte sequences that do not follow the UTF-8 header/trailer
 *     structure (overlong or out-of-range encodings are not detected)
 *   - a missing line end at the end of the file
 *
 * filename  path of the file to check
 * returns   false when the file could not be opened or reading stopped
 *           for a reason other than end of file; true otherwise, even
 *           when incidents were reported
 */
bool checkOneFile(const char *filename)
{
    std::cout << "Checking " << filename << std::endl;
    std::ifstream ifs(filename, std::ios::binary);
    if (! ifs.is_open()) {
        /* standard C function to print an error message */
        std::perror(filename);
        return false;
    }
    /* number of End Of Line(EOL) characters we have seen */
    int line_count = 0;
    /* number of tab characters (banned by our coding convention). */
    int tab_count = 0;
    /* number of CR-LF sequences (banned) */
    int crlf_count = 0;
    /* number of CR occurrences (banned) */
    int cr_count = 0;
    /* number of other C0 control characters (below 0x20). */
    /* They do not appear in normal text files */
    int ctrl_count = 0;
    /* number of bad UTF-8 sequences. Happens when character code setting of the editor is incorrect */
    int bad_utf_count = 0;

    /* state variable for the state machine */
    State ss = S_BOL;

    /*
     * When an ifstream object is used as a condition, it is converted
     * to bool by a conversion function of the class.  The value is
     * true as long as no operation on the stream has failed.  It only
     * becomes false AFTER a read has failed, so reaching the end of the
     * file is noticed by the failing get() below, not by this condition.
     */
    while (ifs) {
        char c = '\0'; // note that this is a 'signed' type and the range is -128 to +127
        // Read one byte from the stream.  When nothing could be read
        // (end of file or error) c holds no data, so leave the loop
        // instead of feeding it to the state machine.
        if (! ifs.get(c)) {
            break;
        }

        // reenter point, when state transition happens without
        // consuming the input character
        reenter_point:

        switch (ss) {
            case S_BOL:
                ss = S_NORMAL;
                /* fall through */
            case S_NORMAL:
            {
                switch(c) {
                    case '\n':
                        line_count++;
                        ss = S_BOL;
                        break;
                    case '\r':
                        ss = S_CR;
                        break;
                    case '\t':
                        tab_count++;
                        report(tab_count, "Tab character", filename, line_count);
                        break;
                    default:
                        if (isUtf8Header3(c)) {
                            ss = S_UTF_EXPECT3;
                        } else if (isUtf8Header2(c)) {
                            ss = S_UTF_EXPECT2;
                        } else if (isUtf8Header1(c)) {
                            ss = S_UTF_EXPECT1;
                        } else if (isUtf8Trailer(c) || static_cast<unsigned char>(c) >= 0xf8) {
                            // A trailer without a header, or a byte
                            // 1111 1xxx which never appears in UTF-8.
                            bad_utf_count++;
                            report(bad_utf_count, "Bad multibyte sequence", filename, line_count);
                        } else if (static_cast<unsigned char>(c) < 0x20) {
                            ctrl_count++;
                            report(ctrl_count, "Unexpected control character", filename, line_count);
                        }
                        break;
                }
            }
            break;
            case S_CR:
            {
                if (c == '\n') {
                    // CRLF = Windows EOL code
                    crlf_count++;
                    report(crlf_count, "Windows newline sequence (CR,LF)", filename, line_count);
                    line_count++;
                    ss = S_BOL;
                } else {
                    // CR = Very Old MacOS EOL code
                    cr_count++;
                    report(cr_count, "Old-time MacOS newline sequence (CR)", filename, line_count);
                    line_count++;
                    ss = S_BOL;
                    /* We do not consume the current c. */
                    goto reenter_point;
                }
            }
            break;
            case S_UTF_EXPECT3:
            case S_UTF_EXPECT2:
            case S_UTF_EXPECT1:
            {
                if (isUtf8Trailer(c)) {
                    // correct so far: one trailer fewer is outstanding
                    if (ss == S_UTF_EXPECT3) {
                        ss = S_UTF_EXPECT2;
                    } else if (ss == S_UTF_EXPECT2) {
                        ss = S_UTF_EXPECT1;
                    } else {
                        ss = S_NORMAL;
                    }
                } else {
                    bad_utf_count++;
                    report(bad_utf_count, "Bad multibyte sequence", filename, line_count);
                    ss = S_NORMAL;
                    /*
                     * The byte that broke the sequence is not part of
                     * it.  Look at it again as an ordinary byte so that
                     * a newline, tab or new header is not lost.
                     */
                    goto reenter_point;
                }
            }
            break;
            default:
                assert(0);
        }
    }

    /* Sequences that were still open when the data ran out. */
    if (ss == S_CR) {
        /* the last byte was a CR that no LF followed */
        cr_count++;
        report(cr_count, "Old-time MacOS newline sequence (CR)", filename, line_count);
        line_count++;
        ss = S_BOL;
    } else if (ss == S_UTF_EXPECT3 || ss == S_UTF_EXPECT2 || ss == S_UTF_EXPECT1) {
        /* the file ended in the middle of a multibyte character */
        bad_utf_count++;
        report(bad_utf_count, "Bad multibyte sequence", filename, line_count);
        ss = S_NORMAL;
    }

    if (ss != S_BOL) {
        report(1, "Missing EOL at end of file", filename, line_count);
    }
    /*
     * the ifs object should be in eof (end of file) state now.
     * If it isn't, that means some other error happened.
     */
    if (! ifs.eof()) {
        std::perror(filename);
        return false;
    }
    ifs.close();
    return true;
}

Download the source code

2.3.2. 作業用のディレクトリを用意する/Prepare a directory for the work

プログラムをコンパイルするために、ホームディレクトリの下にディレクトリを作ります。

Make a directory inside your home directory to compile the program.

Note

ホームディレクトリに、過去の授業や演習などで作成したファイルが残っている場合には、それらを格納するためのディレクトリをまず作って、過去のファイルはそこにしまっておきましょう。不要なファイルでしたら消してしまいましょう。

If your home directory is scattered with files from former classes, seminars, or any other activities, make a directory to stow those files away. If you think they are unnecessary, you can choose to delete them.

ここではディレクトリ名を tools/source_checker とします。

Name the directory tools/source_checker:

$ cd
$ mkdir -p tools/source_checker
$ cd tools/source_checker

引数なしで cd を使うと、ホームディレクトリに移動します。また、 mkdir に -p オプションをつけると、ディレクトリの階層を一度に作ることができます。

Using cd with no arguments will take you to your home directory. By giving the -p option to the mkdir command, you can make multiple levels of directories at once.

Note

本文書に従って操作するときは、（マウスでコピー＆ペーストするのではなく）、実際にタイプしてみて下さい。タイプすると覚えます。マウスでなぞっても、マウスのなぞり方しか記憶に残らないようです。タイプして覚えると、タイプを省力化するためのいろいろな仕掛けを活用できるようになります。

When you are following the instructions in this document, we urge you to actually type the commands, rather than copy-and-pasting them with a mouse. You will memorize by typing. Dragging the mouse only lets you memorize how to drag the mouse. Once you memorize through typing, you will be able to leverage the various tools to reduce the amount of typing.

2.3.3. ソースコードをそこに配置する/Place the source code

ソースコードのダウンロードリンクが、このページのソースコードのリストの末尾にあります。作成したディレクトリにダウンロードしたソースを格納して下さい。

もしもブラウザでデフォルトのダウンロードディレクトリに格納した場合には、 mv で source_checker ディレクトリに移動させてみてください。ブラウザのダウンロード先ディレクトリはOSやブラウザによって異なるので、自分で調べてください。

Get the source file from the download link at the end of the above source listing and store it in the directory you just created.

If you stored the file to the browser’s default download directory, try moving that file to the ‘source_checker’ directory with the mv command. The default download destination directory differs depending on browsers and OSes, so check for yourself:

$ pwd                                              # make sure you are in the right directory
/home/.../your-username/tools/source_checker
$ mv ~/Downloads/source_checker.cpp .

pwd は、print working directory の略で、シェルのカレントディレクトリを確認するために使えます。念の為、どこにいるのか確認したいときに使います。

mv の後のチルダ “~” はシェルによってホームディレクトリのパス名に展開されます。”~ユーザ名” とすると、任意のユーザのホームディレクトリを得ることができます。

mv の最後の “.” はカレントディレクトリのパス名です。ファイルの移動先としてカレントディレクトリを指定しています。

pwd is short for “print working directory”, and it does just that. You can use it to confirm which directory your shell is currently in.

The tilde “~” after mv will be expanded to your home directory path name by the shell. The form “~username” can also be used and will be expanded to the home directory specified by the user name.

The “.” at the end of the mv command line is the path name for the current directory. The current directory is specified as the destination of the move operation.

2.3.4. コンパイルする/Compile the code

C++のソースコードをコンパイルするにはC++用のコンパイラを使います。

コンパイラの種類がいろいろありますが、ここではLinux上で広く使える g++ を使います。

To compile C++ source code, you need to use a C++ compiler.

There are many compilers available, but here we will use g++, which is widely available on Linux:

$ pwd
/home/.../your-username/tools/source_checker
$ g++ -o source_checker source_checker.cpp

存在しているパス名をタイプするときは、TABキーによる入力補完が効くことが多いので試してみてください。これから作り出すファイルについては無理です。

When you type an existing pathname, keep in mind that type assisting (input completion) is available through the tab key. However, this will not work for pathnames you are going to create.

2.3.5. 実行する/Run the program

プログラムを実行するには、プログラムのファイル名をシェルに入力します。シェルはコマンド名を受け取ると、環境変数PATHに列挙されたディレクトリにコマンドを探しに行きます。”ls” コマンドなどはそのように実行されます。一方、パスセパレータ “/” を含んだパス名をコマンド名として入力すると、環境変数PATHとは関係なく、そのパス名を探しに行きます。

今、コンパイルして、出来上がったばかりの source_checker プログラムのファイル名だけを入力すると、シェルはPATHの中を探しに行き、おそらくエラーになります。

手元のディレクトリにあるファイルを実行するには、プログラムのファイル名だけを入力したのではだめで、パスセパレータ ‘/’を1つ以上含んだパス名を指定する必要があります。”カレントディレクトリにある source_checker” でしたら、 ./source_checker がパス名になります。

To run a program, you enter the file name of the program into the shell. When the shell receives a command name, it searches for that file in the directories listed in the environment variable PATH. The “ls” command and others are executed in this way. On the other hand, when you enter a path name that includes at least one path separator “/”, the shell will look directly at that path name without using PATH.

If you enter the file name “source_checker” into the shell, the shell will search that name in PATH, likely resulting in an error.

To run a program file in the current directory, it is not enough to enter the file name. You must enter a path name with at least one path separator. “the source_checker file in the current directory” can be expressed by the path “./source_checker”:

$ ls                        # make sure we are in the correct directory
source_checker  source_checker.cpp
$ ./source_checker source_checker.cpp
Checking source_checker.cpp

2.3.6. PATH経由で使えるようにする/make it usable via PATH

環境変数PATHで指定されたディレクトリにプログラムを置いておけば、カレントディレクトリがどこであろうと、すぐにコマンドとして使うことができます。

ここでは、ホームディレクトリの下に自作コマンド用のディレクトリを設けて、環境変数PATHにそのディレクトリを指定してみます。

If you store your program in a directory that is included in the environment variable PATH, you will be able to invoke it as a command no matter where the current directory of your shell is.

Here we will create a directory inside your home directory for storing your own commands, and point to that directory from PATH.

まず、ホームディレクトリの下に .local/bin というディレクトリを作ります。このディレクトリ名を使うツールも存在するので、人によっては最初から出来ているかも知れません。

First, we make a directory .local/bin under your home directory. This directory name is used by some tools, so some of you may already have this directory:

$ ls                        # make sure we are in the correct directory
source_checker  source_checker.cpp
$ mkdir -p ~/.local/bin
$ cp source_checker ~/.local/bin

次に、環境変数PATHにこのディレクトリを登録します。環境変数の設定の仕方についてのチュートリアルも合わせて読んでください。

Next, we add an entry for this new directory in PATH. Also read the tutorial on environment variables.

$ PATH=$PATH:~/.local/bin
$ export PATH

これで準備ができました。いまや先頭の “./” をつけずにコマンドとして起動できます。

We are done. You no longer need to add the “./” in front of the file name:

$ source_checker *.cpp
Checking source_checker.cpp

2.3.7. プログラムの中身/The content of the program

プログラムの中身について簡単に説明します。

このプログラムは文字コードのチェックツールであり、シミュレーション計算と直接の関係はありませんが、シミュレーションプログラムの実装にも役立つテクニックをいくつか使っています。

このチュートリアルに取り組んだ時点で難しいと感じる場合は、少しC++言語に慣れてから読んでみてください。

We will briefly explain the content of the program.

This program is a character code checker tool and is not directly related to simulation calculations. However, it uses some techniques that are also useful when implementing simulation programs.

If you find the content difficult at the time you work on this tutorial, you can revisit the content later when you get more used to the C++ language.

2.3.7.1. main関数/The main function

/* check the encoding of files */
int main(int argc, const char *argv[])
{
    /* loop through the arguments */
    for (int i = 1; i < argc; i++) {
        bool res = checkOneFile(argv[i]);
        if (! res) {
            /* failure (following linux convention) */
            return EXIT_FAILURE;
        }
    }
    /* success (following linux convention) */
    return EXIT_SUCCESS;

main関数は、argvで渡されたパス名をひとつずつ checkOneFile 関数に渡します。 checkOneFileは、パス名の誤りなどによって処理に失敗した場合にはfalseを返します。処理に失敗した時点でループは終了します。

main関数の戻り値は、プロセスの終了ステータス値として、呼び出し元の親プロセス（典型的にはシェル）に伝達されます。 Linuxも従う POSIX規格では終了ステータスの値として 0または EXIT_SUCCESS は正常終了を意味し、 EXIT_FAILURE は、異常終了を意味します。

The main function will loop through the path names given as argv, and calls the checkOneFile function for each path name. checkOneFile will return false on a failure. It can fail if the given pathname is not readable. Upon failure, the loop will be terminated.

The return value of the main function will become the exit status code of the process and will be sent to the parent process, which typically is a shell.

According to the POSIX standard, with which Linux complies, the exit status value 0 or EXIT_SUCCESS denotes success, and EXIT_FAILURE denotes an error.

2.3.7.2. checkOneFile関数/The checkOneFile function

bool checkOneFile(const char *filename)
{
    std::cout << "Checking " << filename << std::endl;
    std::ifstream ifs(filename, std::ios::binary);
    if (! ifs.is_open()) {
        /* standard C function to print an error message */
        perror(filename);
        return false;
    }
    /* number of End Of Line(EOL) characters we have seen */
    int line_count = 0;
    /* number of tab characters (banned by our coding convention). */
    int tab_count = 0;
    /* number of CR-LF sequences (banned) */
    int crlf_count = 0;
    /* number of CR occurences (banned) */
    int cr_count = 0;
    /* number of other C1 segment control characters (0x01 through 0x1f) */
    /* They do not appear in normal text files */
    int c1_count = 0;
    /* number of bad UTF-8 sequences. Happens when character code setting of the editor is incorrect */
    int bad_utf_count = 0;

    /* state variable for the state machine */
    State ss = S_BOL;

    /*
     * when an ifstream class object is referenced in an if condition,
     * ifstream will be casted into a bool type by a custom-made
     * cast function.  The bool value 'true' will mean that the
     * ifstream object is in good state. False will mean that no more
     * data can be obtained from the stream.
     */
    while (ifs) {
        char c; // note that this is a 'signed' type and the range is -128 to +127
        // read one byte from the stream.
        ifs.get(c);

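checkOneFile関数では、ファイルから1バイトずつ読み込むために、std::ifstream クラスの変数 ifs を作ります。コンストラクタ引数にはファイル名と、バイナリモードを指定するフラグを渡しています。バイナリモードを指定しないと（テキストモード）、環境によっては改行コードが変換されてからプログラムに渡されることがあります（例えばWindowsでは CR LF が LF に変換されます）。このプログラムでは改行コードの種類を含めて、ファイルのバイト列をそのまま調べたいので、バイナリモードを指定します。なお、空白、タブ、改行などのいわゆる whitespace character が区切り文字として読み飛ばされるのは >> 演算子による書式付き入力の場合であり、ここで使っている get() は1バイトずつそのまま読み込みます。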

ifstreamに指定したパス名に問題があると、ファイルのオープンに失敗します。成功したか失敗したかを、is_open() メソッドで確認しています。 perror関数は、C/C++標準ライブラリの中で生じたエラーの種類を説明するメッセージを印字する関数です。perror関数の引数には、ファイル名など、そのときの操作の対象だったモノの名前を渡します。この名前は、メッセージの一部に使われます。

メッセージとしては、以下のようなものが得られます:

spel_misteik.cpp: No such file or directory

ファイルのオープンに成功したら、while文の中で1バイトずつファイルを読んでいきます。 whileの条件には変数 ifs をそのまま渡しています。 while ( 式 ) の式の部分には論理値 (bool型の値) が求められます。 ifsはbool型ではなく std::ifstream クラスです。std::ifstreamクラスには、bool 型への変換関数が設けられており、それまでの読み出しがすべて成功していれば true、読み出しに失敗した後（ファイルの終端に達して読み出せなかった場合を含む）は false を返すようになっています。そのため、この書き方で、「ファイルからデータが読み取れている間は繰り返す」と書いたことになります。

関数の残りの部分では、ファイルに登場する文字コードのチェックをしています。

以下の事項をチェックしています。

タブ文字は登場しないか。
改行文字として、 Unix形式の改行以外のものが使われていないか。
改行以外の制御コード(Backspace, Beep, Form feed等)が登場していないか。
マルチバイト文字は UTF-8 の規則に則っているか。Windowsの漢字コードを使うと、このチェックに抵触します。
ファイルの最後の改行の後に、文字がないか。つまり、最終行の末尾の改行を忘れていないか。

    while (ifs) {
        char c; // note that this is a 'signed' type and the range is -128 to +127
        // read one byte from the stream.
        ifs.get(c);

        // reenter point, when state transition happens without
        // consuming the input character
        reenter_point:

        switch (ss) {
            case S_BOL:
                ss = S_NORMAL;
                /* fall through */
            case S_NORMAL:
            {
                switch(c) {
                    case '\n':
                        line_count++;
                        ss = S_BOL;
                        break;
                    case '\r':
                        ss = S_CR;
                        break;
                    case '\t':
                        tab_count++;
                        report(tab_count, "Tab character", filename, line_count);
                        break;
                    default:
                        if (isUtf8Header3(c)) {
                            ss = S_UTF_EXPECT3;
                        } else if (isUtf8Header2(c)) {
                            ss = S_UTF_EXPECT2;
                        } else if (isUtf8Header1(c)) {
                            ss = S_UTF_EXPECT1;
                        } else if (isUtf8Trailer(c)) {
                            bad_utf_count++;
                            report(bad_utf_count, "Bad multibyte sequence", filename, line_count);
                        } else if (((unsigned char)c) < 0x20) {
                            c1_count++;
                            report(c1_count, "Unexpected control character", filename, line_count);
                        }
                        break;
                }
            }
            break;
            case S_CR:
            {
                if (c == '\n') {
                    // CRLF = Windows EOL code
                    crlf_count++;
                    report(crlf_count, "Windows newline sequence (CR,LF)", filename, line_count);
                    line_count++;
                    ss = S_BOL;
                } else {
                    // CR = Very Old MacOS EOL code
                    cr_count++;
                    report(cr_count, "Old-time MacOS newline sequence (CR)", filename, line_count);
                    line_count++;
                    ss = S_BOL;
                    /* We do not consume the current cc. */
                    goto reenter_point;
                }
            }
            break;
            case S_UTF_EXPECT3:
            {
                if (isUtf8Trailer(c)) {
                    // correct UTF-8 sequence.
                    ss = S_UTF_EXPECT2;
                } else {
                    bad_utf_count++;
                    report(bad_utf_count, "Bad multibyte sequence", filename, line_count);
                    ss = S_NORMAL;
                }
            }
            break;
            case S_UTF_EXPECT2:
            {
                if (isUtf8Trailer(c)) {
                    // correct UTF-8 sequence.
                    ss = S_UTF_EXPECT1;
                } else {
                    bad_utf_count++;
                    report(bad_utf_count, "Bad multibyte sequence", filename, line_count);
                    ss = S_NORMAL;
                }
            }
            break;
            case S_UTF_EXPECT1:
            {
                if (isUtf8Trailer(c)) {
                    // correct UTF-8 sequence.
                    ss = S_NORMAL;
                } else {
                    bad_utf_count++;
                    report(bad_utf_count, "Bad multibyte sequence", filename, line_count);
                    ss = S_NORMAL;
                }
            }
            break;
            default:
                assert(0);
        }
    }
    if (ss != S_BOL) {
        report(1, "Missing EOL at end of file", filename, line_count);
    }
    /*
     * the ifs object should be in eof (end of file) state now.
     * If it isn't, that means some other error happened.
     */
    if (! ifs.eof()) {
        perror(filename);
        return false;
    }
    ifs.close();
    return true;
}