PDF アプリケーションサンプル

PDFファイルを開き、それを構文解析するサンプルアプリケーションを公開します。ここで示すアプリケーションは、PDF構文を説明するためのものですので、すべてのPDFファイルを構文解析できるものではありません。
このサンプルは、PDF Toolsとは無関係です。PDF構文を説明するための簡単なサンプルアプリケーションの一部です。

前へ <<<

３. PDFオブジェクトの解析

PDFオブジェクト（Boolean、Integer、Real、String、Hexadecimal、Name、Array、Dictionary、Referenceの各オブジェクト）を構文解析するメソッドです。

// Identifies the PDF object that starts at or after data.pBlobData[*offs].
//   offs : in/out. On entry the position to start at; leading white-space is
//          skipped and *offs is left on the first byte of the object.
//   type : out, may be NULL. Receives one of the PDF_OBJ_* constants.
//   len  : out. Receives the length of the object in bytes, counted from *offs.
// Returns 0 when an object was recognised and -1 otherwise.
int ParseObject(int *offs, int *type, int *len)
{
    BYTE    *pdf = data.pBlobData;
    int     size = 0;
    int     hierarchy;

    // Skip leading white-space, but never walk past the end of the data
    // (strchr() also matches the terminating NUL of WHITE_SPACE, so a run of
    // zero bytes would otherwise never stop the loop).
    while(((*offs)<(int)data.cbSize)&&strchr(WHITE_SPACE,(int)pdf[*offs])) (*offs)++;
    if((*offs)>=(int)data.cbSize) return -1;    // nothing left to parse

各オブジェクトを識別する定数を以下のように定義しています。

#define PDF_OBJ_BOOL    1    // 1:Boolean
#define PDF_OBJ_NUM     2    // 2:Number
#define PDF_OBJ_INT     3    // 3:Integer
#define PDF_OBJ_REAL    4    // 4:Real
#define PDF_OBJ_STR     5    // 5:String
#define PDF_OBJ_HEXA    6    // 6:Hexadecimal
#define PDF_OBJ_NAME    7    // 7:Name
#define PDF_OBJ_ARRAY   8    // 8:Array
#define PDF_OBJ_DIC     9    // 9:Dictionary
#define PDF_OBJ_UNDEF  10    //10:(undefined)
#define PDF_OBJ_REF    11    //11:Reference
#define PDF_OBJ_NULL   12    //12:Null

３．１ディクショナリ（Dictionary Object）の構文解析

ディクショナリ（Dictionary Object）は、「<<」と「>>」で囲まれたオブジェクトで名前オブジェクトと値のオブジェクトが対となって格納されています。ここではその内容の解析はせず、文字列として解析します。
ただし、値オブジェクトとしてディクショナリも格納できますので、それを考慮してオブジェクトの終端を検索します。

    if((((*offs)+1)<(int)data.cbSize)&&(pdf[*offs]=='<')&&(pdf[(*offs)+1]=='<')){
        // Dictionary: "<<" ... ">>".
        // The entries are not interpreted; only the matching ">>" is located,
        // counting nested dictionaries on the way.
        const int   limit = (int)data.cbSize;

        if(type) *type = PDF_OBJ_DIC;
        size = 2;                       // step over the opening "<<"
        hierarchy = 0;                  // depth of nested dictionaries

        // Every probe looks at two bytes, so both must lie inside the data.
        while(((*offs)+size+1)<limit){
            const BYTE  *p = pdf+(*offs)+size;

            if(p[0]=='\\'){
                // Escaped character: skip the backslash and the byte it escapes
                // so that an escaped '<' or '>' is never taken as a bracket.
                size += 2;
                continue;
            }
            if((p[0]=='<')&&(p[1]=='<')){
                hierarchy++;
                size += 2;
            }else if((p[0]=='>')&&(p[1]=='>')){
                if(!hierarchy){
                    *len = size+2;      // include the closing ">>"
                    return 0;
                }
                hierarchy--;
                // Consume both bytes; advancing by one would let ">>>>" be
                // counted as three closing brackets instead of two.
                size += 2;
            }else{
                size++;
            }
        }
        return -1;                    //Error: closing ">>" not found
    }

３．２配列（Array Object）の構文解析

配列（Array Object）は、「[」と「]」で囲まれたオブジェクトで、値のオブジェクトが順に格納されています。
ただし、値オブジェクトとして配列も格納できますので、それを考慮してオブジェクトの終端を検索します。

    else if(pdf[*offs]=='['){
        // Array: "[" ... "]".
        // The elements are not interpreted; only the matching "]" is located,
        // counting nested arrays on the way.
        if(type) *type = PDF_OBJ_ARRAY;
        size = 1;                       // step over the opening "[" only, so
                                        // that "[]" and "[[...]]" are scanned
                                        // from their first content byte
        hierarchy = 0;                  // depth of nested arrays

        while(((*offs)+size)<(int)data.cbSize){
            const BYTE  ch = pdf[(*offs)+size];

            if(ch=='\\'){
                // Escaped character: skip the backslash and the byte it escapes
                // so that an escaped '[' or ']' is never taken as a bracket.
                size += 2;
                continue;
            }
            if(ch=='['){
                hierarchy++;
            }else if(ch==']'){
                if(!hierarchy){
                    *len = size+1;      // include the closing "]"
                    return 0;
                }
                hierarchy--;
            }
            size++;
        }
        return -1;                    //Error: closing "]" not found
    }

３．３文字列（String Object）の構文解析

文字列（String Object）は、「(」と「)」で囲まれたオブジェクトです。
ただし、内部の「(」と「)」の対は文字列として認識されますので、それを考慮してオブジェクトの終端を検索します。

    else if(pdf[*offs]=='('){
        // Literal string: "(" ... ")".
        // Balanced, unescaped parentheses inside the string belong to it, so
        // the nesting depth is tracked while looking for the closing ")".
        const int   limit = (int)data.cbSize;

        if(type) *type = PDF_OBJ_STR;
        size = 1;                       // step over the opening "("
        hierarchy = 0;                  // depth of nested parentheses

        while(((*offs)+size)<limit){
            const BYTE  ch = pdf[(*offs)+size];

            if(ch=='\\'){
                int     digits;

                size++;                 // now on the escaped character
                if(((*offs)+size)>=limit) return -1;
                if((pdf[(*offs)+size]>='0')&&(pdf[(*offs)+size]<='7')){
                    // Octal escape: one to three octal digits
                    for(digits=0; digits<3; digits++){
                        if(((*offs)+size)>=limit) return -1;
                        if((pdf[(*offs)+size]<'0')||(pdf[(*offs)+size]>'7')) break;
                        size++;
                    }
                }else{
                    size++;             // single escaped character
                }
                // Re-examine the byte after the escape sequence from the top of
                // the loop: it may itself be a backslash starting another escape.
                continue;
            }
            if(ch=='('){
                hierarchy++;
            }else if(ch==')'){
                if(!hierarchy){
                    *len = size+1;      // include the closing ")"
                    return 0;
                }
                hierarchy--;
            }
            size++;
        }
        return -1;                                            //Error: closing ")" not found
    }

３．４１６進数（Hexadecimal Object）の構文解析

１６進数（Hexadecimal Object）は、「<」と「>」で囲まれたオブジェクトです。このオブジェクトには、数字の0~9とa~fおよびA~F以外は含まれませんので、それを考慮しています。

    else if(pdf[*offs]=='<'){
        // Hexadecimal string: "<" hex digits ">"
        if(type) *type = PDF_OBJ_HEXA;

        // Walk forward until the closing ">"; any byte that is not a
        // hexadecimal digit (0-9, a-f, A-F) makes the object invalid.
        for(size=1; ((*offs)+size)<(int)data.cbSize; size++){
            const BYTE  ch = pdf[(*offs)+size];

            if(ch=='>'){
                *len = size+1;          // include the closing ">"
                return 0;
            }
            if(!(((ch>='0')&&(ch<='9'))
                ||((ch>='a')&&(ch<='f'))
                ||((ch>='A')&&(ch<='F')))) return -1;                //Error
        }
        // Data ended before ">" was found: control leaves the if/else chain
        // and reaches the function's final error return.
    }

３．５真偽値（Boolean Object）の構文解析

真偽値（Boolean Object）は、「true」と「false」のいずれかです。真偽値は、ディクショナリや配列の構成要素としてのみ意味がありますので、ホワイトスペースもしくはデリミタで終端されます。

    else if(((((int)data.cbSize-(*offs))>=4)&&!strncmp((char*)(pdf+(*offs)),"true",4))
          ||((((int)data.cbSize-(*offs))>=5)&&!strncmp((char*)(pdf+(*offs)),"false",5))){
        // Boolean: the keyword "true" or "false"
        size = (pdf[*offs]=='t') ? 4 : 5;

        // The keyword must be followed, inside the data, by white-space or a
        // delimiter; otherwise it is only the prefix of some longer token.
        if(((*offs)+size)>=(int)data.cbSize) return -1;                         //Error
        if(!strchr(WHITE_SPACE DELIMITER,(int)pdf[(*offs)+size])) return -1;    //Error
        if(type) *type = PDF_OBJ_BOOL;
        *len = size;
        return 0;
    }

デリミタは以下のように定義されます。

// Delimiter characters; the parser treats any of them (or white-space) as the end of a token.
#define DELIMITER    "()<>[]{}/%"

３．６名前（Name Object）の構文解析

名前（Name Object）は、「/」で開始するオブジェクトです。単独で存在しませんので、必ずホワイトスペースまたは、デリミタが後に続きます。

    else if(pdf[*offs]=='/'){
        // Name: "/" followed by characters up to the next white-space or delimiter
        for(size=1; ((*offs)+size)<(int)data.cbSize; size++){
            if(!strchr(WHITE_SPACE DELIMITER,(int)pdf[(*offs)+size])) continue;

            // Terminator found: the name covers everything before it
            if(type) *type = PDF_OBJ_NAME;
            *len = size;
            return 0;
        }
        return -1;        //Error: data ended before the name was terminated
    }

３．７数字（実数、整数）または、参照（Real, Integer, Reference Object）の構文解析

実数、整数、参照は数字を含んだオブジェクトです。それぞれの特徴を考慮して解析しています。

    else if((pdf[*offs]=='+')||(pdf[*offs]=='-')||(pdf[*offs]=='.')
          ||((pdf[*offs]>='0')&&(pdf[*offs]<='9'))){
        // Integer, real or indirect reference.
        //   integer   : [sign] digits
        //   real      : [sign] [digits] "." [digits]
        //   reference : digits white-space digits white-space "R"   (unsigned only)
        // Every number must be followed by white-space or a delimiter that lies
        // inside the data; all reads below are bounded by 'avail'.
        const BYTE  *num = pdf+(*offs);
        const int   avail = (int)data.cbSize-(*offs);   // bytes readable from num
        const int   hasSign = ((num[0]=='+')||(num[0]=='-'));
        int         digits;                             // digits before any "."
        int         look;                               // look-ahead index

        // Optional sign, then the integer part
        size = hasSign ? 1 : 0;
        while((size<avail)&&(num[size]>='0')&&(num[size]<='9')) size++;
        digits = size-(hasSign ? 1 : 0);

        if((size<avail)&&(num[size]=='.')){
            // Real: skip the "." and the fraction digits (either side may be empty)
            size++;
            while((size<avail)&&(num[size]>='0')&&(num[size]<='9')) size++;
            if((size>=avail)||!strchr(WHITE_SPACE DELIMITER,(int)num[size])) return -1;    //Error
            if(type) *type = PDF_OBJ_REAL;
            *len = size;
            return 0;
        }

        if(!digits) return -1;                          //Error: sign without a number
        if((size>=avail)||!strchr(WHITE_SPACE DELIMITER,(int)num[size])) return -1;        //Error

        if(!hasSign&&strchr(WHITE_SPACE,(int)num[size])){
            // An unsigned integer followed by white-space may be the object
            // number of a reference. Accept it as one only when the complete
            // "generation-number R" tail is present; scanning ahead for any 'R'
            // could run off the data or swallow unrelated objects.
            int     genStart;

            look = size;
            while((look<avail)&&strchr(WHITE_SPACE,(int)num[look])) look++;
            genStart = look;
            while((look<avail)&&(num[look]>='0')&&(num[look]<='9')) look++;
            if((look>genStart)&&(look<avail)&&strchr(WHITE_SPACE,(int)num[look])){
                while((look<avail)&&strchr(WHITE_SPACE,(int)num[look])) look++;
                if((look<avail)&&(num[look]=='R')
                    &&(((look+1)>=avail)||strchr(WHITE_SPACE DELIMITER,(int)num[look+1]))){
                    // Reference
                    if(type) *type = PDF_OBJ_REF;
                    *len = look+1;      // include the "R"
                    return 0;
                }
            }
        }

        // Plain integer
        if(type) *type = PDF_OBJ_INT;
        *len = size;
        return 0;
    }

３．８エラー

いずれのオブジェクトにも合致しなかった場合は、エラーとして処理します。

    return -1;    //Error: the data at *offs matched none of the object forms
}

サンプルアプリケーションについて

PDFデータを解析できるサンプルアプリケーションを公開しています。PDFParse17_1_0.zip(Windows版)をダウンロードして、お手元のPDFデータを解析してください。このアプリケーションのソースコードは、PDF Tools SDKをご購入されたお客様のご希望により開示しています。詳細はお問い合わせください（メール）。

(記載の会社名および製品名は、各社の登録商標および商標です。)

PDF アプリケーション サンプル

３. PDFオブジェクトの解析

３．１ ディクショナリ（Dictionary Object）の構文解析

３．２ 配列（Array Object）の構文解析

３．３ 文字列（String Object）の構文解析

３．４ １６進数（Hexadecimal Object）の構文解析

３．５ 真偽値（Boolean Object）の構文解析

３．６ 名前（Name Object）の構文解析

３．７ 数字（実数、整数）または、参照（Real, Integer, Reference Object）の構文解析

３．８ エラー

サンプル アプリケーションについて