articles 2012年10月26日 by Drunkar

C++で「はじめての機械学習」2章, 3章

４章は判断木の構成で、データの属するカテゴリと各属性に対する判別結果を入力データとして与え、どの判断を用いて分類すればよいかの選択を支援するプログラムと、後半ではそれを確率探索しています。後半はまあいいかと思ったので前半部だけ。/** 判断木の構成を支援するプログラム (判断木を構成するためには、与えられたデータセットを適切に分類する属性を選択しなければならない。そこである属性を選んだ場合にどのように分類されるかを計算する。）入力は標準入力から与え、出力は標準出力に出力します使い方： ...

せかいらぼ

C++で「はじめての機械学習」5章

5章は遺伝的アルゴリズムです。ルーレット選択、一点交叉、突然変異、のsimpleGAです。最初は、目的関数最適化をめざすヒューリスティック。/** * simpleGA： * 1 遺伝子プールの初期化 * 2 loop{ * 交叉(親の選択を含む) * 突然変異 * 結果の出力 * } * 使い方： * $./simpleGA (遺伝子の桁数) (遺伝子の数) (終了条件スイッチ) (終了条件) > (出力ファイル) * 終了条件スイッチ：(終了条件)を指定。0->ループ回数、1->最高適応度下限 **/#include <iostream>#include <string>#include <limits>#incl...

せかいらぼ

C++で「はじめての機械学習」6章

与えられたデータセットと評価値に近づくように重みとしきい値をバックプロパゲーションで調整する３層パーセプトロン/** * バックプロパゲーションによるニューラルネットの学習 * 使い方 * $./backPropagation (変数の数) (中間層のセル数) (誤差の上限値) < (学習データセットのファイル名) *誤差の推移や、学習結果となる結合係数などを出力します **/#include <iostream>#include <vector>#include <cstdlib>#include <float.h>#include <ctime>#include <cmath>using namespace std;inl...

www.amazon.co.jp

Amazon.co.jp

「はじめての機械学習」という、機械学習の基本をC言語のソース付きで学べる書籍を読んでいるのだけど、しーぷらぷらーの僕としてはむず痒さを感じることもあるので、C++で実装をしてみることにする。

Contents

2章 最小二乗法

/**
  最小二乗法による計算式の決定
  y = a0 + a1*x　の a0, a1を出力
 **/

#include <iostream>
using namespace std;

/**
 * Fits the line y = a1*x + a0 to (x, y) pairs by the method of least squares.
 *
 * Pairs are read from standard input until end of input; a malformed line is
 * reported and skipped. The fitted line is printed to standard output.
 *
 * Returns 0 on success, or -1 when fewer than two valid pairs were read or
 * when the x values do not vary (the line is then undetermined).
 */
int main()
{
    double xi, yi;                                  // current input pair
    double sxi = 0, syi = 0, sxiyi = 0, sxi2 = 0;   // running sums: x, y, x*y, x*x
    int    n = 0;                                   // number of valid pairs

    std::cout << "データの組を入力：xi yi" << std::endl
              << "(ctrl+d で入力終了)" << std::endl;
    while (true) {
        // Test the extraction itself rather than eof(): the last pair of an
        // input without a trailing newline sets eofbit yet is still valid.
        if (std::cin >> xi >> yi) {
            sxi   += xi;
            syi   += yi;
            sxiyi += xi * yi;
            sxi2  += xi * xi;
            ++n;
            continue;
        }
        if (std::cin.eof()) break;

        std::cout << "無効なデータです" << std::endl;
        std::cin.clear();
        // Discard the rest of the offending line (up to 256 characters).
        std::cin.ignore(256, '\n');
    }

    if (n < 2) {
        std::cout << "データは2組以上必要です" << std::endl;
        return -1;
    }

    // n*sum(x^2) - sum(x)^2 is non-negative in exact arithmetic and is zero
    // exactly when every x is identical; dividing by it would yield inf/nan.
    const double denom = n * sxi2 - sxi * sxi;
    if (!(denom > 0.0)) {
        std::cout << "xの値がすべて同じため直線を決定できません" << std::endl;
        return -1;
    }

    const double a0 = (sxi2 * syi - sxiyi * sxi) / denom;   // intercept
    const double a1 = (n * sxiyi - sxi * syi) / denom;      // slope

    std::cout << "y = " << a1 << "x + " << a0 << std::endl;

    return 0;
}

3章 n-gram

utf-8テキストからn-gramを生成

/**
  ngramを作成するプログラム(utf-8)
  入力は標準入力から与え、出力は標準出力に出力します
  使い方：
    $./ngram (nの値) < (入力ファイル名) > (出力ファイル名)
  入力ファイルには、テキストファイルを指定します
  出力ファイルには、ngramを出力します
 **/

#include <iostream>
#include <string>
#include <vector>
#include <cstdlib>
using namespace std;

/**
 * Splits a UTF-8 encoded string into its individual characters.
 *
 * A new character starts at every byte that is not a continuation byte
 * (bit pattern 10xxxxxx); the continuation bytes that follow are appended to
 * it. The first byte always starts a character, even if it is itself a
 * continuation byte. The input is not validated.
 *
 * @param str UTF-8 text; may be empty.
 * @return One string per character, in input order; empty for empty input.
 */
std::vector<std::string> split_utf8(const std::string& str) {
    std::vector<std::string> result;
    const std::string::size_type length = str.size();

    // An empty string simply never enters the loop, so blank input lines
    // produce no characters instead of indexing past the end.
    std::string::size_type start = 0;
    while (start < length) {
        std::string::size_type end = start + 1;
        while (end < length &&
               (static_cast<unsigned char>(str[end]) & 0xC0) == 0x80) {
            ++end;
        }
        result.push_back(str.substr(start, end - start));
        start = end;
    }
    return result;
}

/**
 * Prints every n-gram of `data` to standard output, one per line, followed
 * by a single empty line.
 *
 * An n-gram is the concatenation of n consecutive elements of `data`, so a
 * sequence of k elements yields k - n + 1 n-grams (including the one that
 * ends at the last element).
 *
 * @param n    Length of each n-gram. If n is not positive or exceeds the
 *             number of elements, no n-gram is printed (only the empty line).
 * @param data The text, already split into one string per character.
 */
void printNgram(int n, const std::vector<std::string> &data)
{
    typedef std::vector<std::string>::size_type Index;

    // Compare in the unsigned domain only after ruling out n <= 0, and never
    // compute size() - n unless it is known not to wrap around.
    if (n > 0 && static_cast<Index>(n) <= data.size()) {
        const Index width = static_cast<Index>(n);
        const Index lastStart = data.size() - width;
        for (Index start = 0; start <= lastStart; ++start) {
            std::string buffer;
            for (Index k = start; k < start + width; ++k) {
                buffer += data[k];
            }
            std::cout << buffer << '\n';
        }
    }
    std::cout << std::endl;
}

/**
 * Entry point of the n-gram generator.
 *
 * Expects exactly one command-line argument, the n-gram length (>= 1).
 * Reads all of standard input line by line, splits each line into
 * characters with split_utf8, and prints the n-grams of the concatenated
 * character sequence with printNgram.
 *
 * Returns 0 on success, -1 on a usage error or an invalid n.
 */
int main(int argc, char *argv[])
{
    // Exactly one argument (the value of n) is required.
    if (argc != 2) {
        std::cerr << "使い方 $./ngram (nの値) "
            << "< (入力ファイル名) > (出力ファイル名)" << std::endl;
        return -1;
    }

    const int n = std::atoi(argv[1]);   // n-gram length
    if (n < 1) {
        std::cerr << "nの値が不適切です" << std::endl;
        return -1;
    }

    // Collect the characters of every input line into one sequence.
    std::vector<std::string> data;
    std::string line;
    while (std::getline(std::cin, line)) {
        const std::vector<std::string> chars = split_utf8(line);
        data.insert(data.end(), chars.begin(), chars.end());
    }

    // Generate the n-grams.
    printNgram(n, data);

    return 0;
}

n-gramを集計

/**
  ngramの頻度分布を作成します
  入力は標準入力から与え、出力は標準出力に出力します
  使い方：
    $./ranking < (入力ファイル名) > (出力ファイル名)
  入力ファイルには、ngramのファイルを指定します
  出力ファイルには、頻度分布を出力します
 **/

#include <iostream>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
using namespace std;

/**
 * Returns the decimal text representation of `num`, as produced by stream
 * insertion.
 */
std::string intToString(int num)
{
    std::ostringstream formatter;
    formatter << num;
    return formatter.str();
}

/**
 * Counts runs of identical consecutive n-grams.
 *
 * The input is expected to be sorted so that equal n-grams are adjacent;
 * each run then corresponds to one distinct n-gram and its frequency.
 *
 * @param ngrams Sorted list of n-grams; may be empty.
 * @return Map from occurrence count to n-gram. Because the count is the key,
 *         only the first n-gram encountered for each count is kept; later
 *         n-grams with the same count are dropped. Empty for empty input.
 */
std::map<int, std::string> countNgrams(const std::vector<std::string> &ngrams)
{
    typedef std::vector<std::string>::size_type Index;

    std::map<int, std::string> result;   // <occurrence count, ngram>

    // Nothing to count; also avoids reading ngrams[0] of an empty vector.
    if (ngrams.empty()) return result;

    Index runStart = 0;   // index of the first element of the current run
    for (Index i = 1; i <= ngrams.size(); ++i) {
        // A run ends at the end of the input or where the n-gram changes.
        if (i == ngrams.size() || ngrams[i] != ngrams[runStart]) {
            result.insert(std::map<int, std::string>::value_type(
                static_cast<int>(i - runStart), ngrams[runStart]));
            runStart = i;
        }
    }

    return result;
}

/**
 * Entry point of the n-gram ranking program.
 *
 * Reads one n-gram per line from standard input, sorts them, counts the
 * occurrences with countNgrams, and prints "<count><TAB><ngram>" lines to
 * standard output in descending order of count.
 *
 * Returns 0; empty input produces no output.
 */
int main()
{
    std::vector<std::string> ngrams;
    std::string              line;

    // Read the n-grams, one per line.
    while (std::getline(std::cin, line)) ngrams.push_back(line);

    // Nothing to rank.
    if (ngrams.empty()) return 0;

    // Sorting makes equal n-grams adjacent, which countNgrams relies on.
    std::sort(ngrams.begin(), ngrams.end());
    std::map<int, std::string> result = countNgrams(ngrams);   // <count, ngram>

    // The map is ordered by ascending count, so walk it backwards.
    for (std::map<int, std::string>::reverse_iterator it = result.rbegin();
         it != result.rend(); ++it) {
        std::cout << it->first << "\t" << it->second << '\n';
    }

    return 0;
}

ngramとかは結構おもしろい
続きはまた今度

※10月27日追記：n-gramの集計はmapだと回数が同じやつが許されないのでmultimapでやるべきでした。multimapバージョンは↓

/**
  ngramの頻度分布を作成します
  入力は標準入力から与え、出力は標準出力に出力します
  使い方：
    $./ranking < (入力ファイル名) > (出力ファイル名)
  入力ファイルには、ngramのファイルを指定します
  出力ファイルには、頻度分布を出力します
 **/

#include <iostream>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
using namespace std;

/**
 * Converts an integer to its decimal string form by writing it to an
 * in-memory output stream.
 */
std::string intToString(int num)
{
    std::ostringstream buffer;
    buffer << num;
    return buffer.str();
}

/**
 * Counts runs of identical consecutive n-grams.
 *
 * The input is expected to be sorted so that equal n-grams are adjacent;
 * each run then corresponds to one distinct n-gram and its frequency.
 *
 * @param ngrams Sorted list of n-grams; may be empty.
 * @return Multimap from occurrence count to n-gram, with one entry per run,
 *         so n-grams that share a count are all retained. Empty for empty
 *         input.
 */
std::multimap<int, std::string> countNgrams(const std::vector<std::string> &ngrams)
{
    typedef std::vector<std::string>::size_type Index;
    typedef std::multimap<int, std::string>     CountTable;

    CountTable result;   // <occurrence count, ngram>

    // Nothing to count; also avoids reading ngrams[0] of an empty vector.
    if (ngrams.empty()) return result;

    Index runStart = 0;   // index of the first element of the current run
    for (Index i = 1; i <= ngrams.size(); ++i) {
        // A run ends at the end of the input or where the n-gram changes.
        if (i == ngrams.size() || ngrams[i] != ngrams[runStart]) {
            result.insert(CountTable::value_type(
                static_cast<int>(i - runStart), ngrams[runStart]));
            runStart = i;
        }
    }

    return result;
}

/**
 * Entry point of the n-gram ranking program (multimap version).
 *
 * Reads one n-gram per line from standard input, sorts them, counts the
 * occurrences with countNgrams, and prints "<count><TAB><ngram>" lines to
 * standard output in descending order of count.
 *
 * Returns 0; empty input produces no output.
 */
int main()
{
    typedef std::multimap<int, std::string> CountTable;

    std::vector<std::string> ngrams;
    std::string              line;

    // Read the n-grams, one per line.
    while (std::getline(std::cin, line)) ngrams.push_back(line);

    // Nothing to rank.
    if (ngrams.empty()) return 0;

    // Sorting makes equal n-grams adjacent, which countNgrams relies on.
    std::sort(ngrams.begin(), ngrams.end());
    CountTable result = countNgrams(ngrams);   // <count, ngram>

    // The multimap is ordered by ascending count, so walk it backwards.
    // The iterator type is taken from the multimap itself; a map iterator is
    // not guaranteed to be the same type.
    for (CountTable::reverse_iterator it = result.rbegin();
         it != result.rend(); ++it) {
        std::cout << it->first << "\t" << it->second << '\n';
    }

    return 0;
}

C++で「はじめての機械学習」2章, 3章

2章最小二乗法

3章 n-gram

utf-8テキストからn-gramを生成

n-gramを集計

コメント

コメントを返信するコメントをキャンセル

2章 最小二乗法

3章 n-gram

utf-8テキストからn-gramを生成

n-gramを集計

関連記事

fitbitの睡眠スコアを90弱で安定させる良い睡眠を続ける簡単な方法

m1 ipad pro 12.9 2021のusb-cハブはコレがベスト

Time Machine不要！Macを11.2.3にダウングレードして原神をm1 macbook airでプレイする

MH-Z19CとM5StickCで二酸化炭素濃度モニタリング

【神軽量HMD】Avegant Glyph 改造: 瓶詰堂さんのaltglyphを作った

PC、iPad、Android、switchもドックいらず！あまりに万能なusb-cハブが最強だった

コメント

コメントを返信する コメントをキャンセル

2章最小二乗法

コメントを返信するコメントをキャンセル