文本页面 (Text Page)

福昕 PDF SDK 提供了用于提取、选择、搜索和检索 PDF 文档中文本的 API。PDF 文本内容存储在与特定页面相关联的 TextPage 对象中。TextPage 类可用于获取 PDF 页面中文本的信息，例如单个字符、单个单词、指定字符范围内或矩形区域内的文本内容等。它还可用于构造其他文本相关类的对象，以便对文本内容执行更多操作或从文本内容中访问指定信息（以 Java 为例）：

在 PDF 页面的文本内容中搜索文本，使用 TextPage 对象来构建 TextSearch 对象。
访问类似超文本链接的文本，使用 TextPage 对象来构建 PageTextLinks 对象。

从 PDF 页面中提取文本

c++

#include "include/common/fs_common.h"
#include "include/pdf/fs_pdfdoc.h"
#include "include/pdf/fs_search.h"

using namespace std;
using namespace foxit;
using namespace foxit::common;
using foxit::common::Library;
using namespace pdf;
...

// Assuming PDFPage page has been loaded and parsed.

// Build the text page for the parsed page.
TextPage text_page(page);
int count = text_page.GetCharCount();
if (count > 0) {
    // Fetch the characters and convert them to UTF-8 in one step, then write
    // the encoded bytes to file.
    String utf8_text = text_page.GetChars().UTF8Encode();
    fwrite((const char*)utf8_text, sizeof(char), utf8_text.GetLength(), file);
}
...

#include "include/fs_basictypes_c.h"
#include "include/fs_common_c.h"
#include "include/fs_pdfdoc_c.h"
#include "include/fs_search_c.h"

...

// Assuming FS_PDFPAGE_HANDLE page has been loaded and parsed.

// Create the text page handle for the parsed page.
FS_TEXTPAGE_HANDLE text_page;
FSDK_TextPage_Create(page, e_FSTextParseFlagsParseTextNormal, &text_page);
int count;
FSDK_TextPage_GetCharCount(text_page, &count);
if (count > 0) {
    // Request characters starting at index 0, passing -1 as the count.
    FS_WSTR wide_text;
    FSDK_TextPage_GetChars(text_page, 0, -1, &wide_text);
    // Encode the wide string as UTF-8 and write the encoded bytes to file.
    FS_BSTR utf8_text;
    FSErrorCode encode_status = FSDK_WStr_UTF8Encode(wide_text, &utf8_text);
    fwrite(utf8_text.str, sizeof(char), utf8_text.len, file);
    FSDK_WStr_Clear(wide_text);
}
...

java

import com.foxit.sdk.pdf.PDFDoc;
import com.foxit.sdk.pdf.TextPage;
...
// Assuming PDFPage page has been loaded and parsed.

// Get the text page object. The parse flag is qualified with its class name
// because only the TextPage class itself is imported.
TextPage textpage = new TextPage(page, TextPage.e_ParseTextNormal);
// Read the page's characters: start at index 0 and request nCharCount characters.
int nCharCount = textpage.getCharCount();
String texts = textpage.getChars(0, nCharCount);
...

import sys
import site

# Record once whether the interpreter is Python 2; the flag selects which
# SDK binding module is imported below.
_PYTHON2_ = sys.version_info.major == 2

if _PYTHON2_:
    # replace with the python2 lib path
    # The path must be quoted with ASCII quotes; typographic quotes are a syntax error.
    site.addsitedir('../../../')
    from FoxitPDFSDKPython2 import *
else:
    from FoxitPDFSDKPython3 import *
...

# Assuming PDFPage page has been loaded and parsed.

# Get the text page object.
text_page = TextPage(page)
count = text_page.GetCharCount()
if count > 0:
    text = text_page.GetChars()
    # Python 2 writes the text as returned; Python 3 encodes it to UTF-8 bytes first.
    payload = text if _PYTHON2_ else text.encode("utf-8")
    file.write(payload)
...

objc

#include "FSPDFObjC.h"
...

// Assuming FSPDFPage page has been loaded and parsed.

// Build the text page with the normal parsing flag.
FSTextPage *textPage = [[FSTextPage alloc] initWithPage:page
                                                  flags:FSTextPageParseTextNormal];
int charCount = [textPage getCharCount];
// Only request characters when the page reports at least one.
if (charCount > 0) {
    // Start at index 0 and pass -1 as the count.
    NSString *pageText = [textPage getChars:0 count:-1];
}
...

const FSDK = require("@foxitsoftware/foxit-pdf-sdk-node");
const fs = require('fs');
...

// Assuming PDFPage page has been loaded and parsed.

// Get the text page object.
let text_page = new FSDK.TextPage(page, FSDK.TextPage.e_ParseTextNormal);
let count = text_page.GetCharCount();
if (count > 0) {
  // Request characters starting at index 0, passing -1 as the count.
  const text = text_page.GetChars(0, -1);
  // Buffer.from is a static factory, not a constructor, so it is called without `new`.
  const buffer = Buffer.from(text, 'utf-8');
  fs.writeSync(file, buffer, 0, buffer.length);
}
...

csharp

using foxit.common;
using foxit.pdf;
...

// Assuming PDFPage page has been loaded and parsed.

// The using statement disposes the text page when the block exits.
using (var text_page = new TextPage(page, (int)TextPage.TextParseFlags.e_ParseTextNormal))
{
    int char_total = text_page.GetCharCount();
    if (char_total > 0)
    {
        // Read char_total characters starting at index 0 and pass them to writer.
        string page_chars = text_page.GetChars(0, char_total);
        writer.Write(page_chars);
    }
}
...

在 PDF 文档中获取矩形区域中的文本

c++

#include "include/common/fs_common.h"
#include "include/pdf/fs_pdfdoc.h"
#include "include/pdf/fs_search.h"

using namespace foxit;
using namespace foxit::common;
using foxit::common::Library;
using namespace pdf;
...

RectF rect; 
rect.left = 90; 
rect.right = 450;
rect.top = 595;
rect.bottom = 580;
TextPage textPage = new TextPage (&page, TextPage::e_ParseTextNormal);
textPage.GetTextInRect(&rect);
...

#include "include/fs_basictypes_c.h"
#include "include/fs_common_c.h"
#include "include/fs_pdfdoc_c.h"
#include "include/fs_search_c.h"
...

// Assuming FS_PDFPAGE_HANDLE page has been loaded and parsed.

// Rectangle to read text from, given by its four edges.
FSRectF rect;
rect.left = 90;
rect.right = 450;
rect.top = 595;
rect.bottom = 580;
// Create the text page handle for the parsed page.
FS_TEXTPAGE_HANDLE text_page;
FSDK_TextPage_Create(page, e_FSTextParseFlagsParseTextNormal, &text_page);
FS_WSTR text;
// Pass the handle created above; the snippet declares text_page, not textPage.
FSDK_TextPage_GetTextInRect(text_page, rect, &text);
...

java

import com.foxit.sdk.pdf.PDFDoc;
import com.foxit.sdk.pdf.TextPage;
import com.foxit.sdk.common.fxcrt.RectF;
import com.foxit.sdk.common.fxcrt.RectFArray;
...
// Assuming PDFPage page has been loaded and parsed.
...

// Build the text page. The parse flag is qualified with its class name
// because only the TextPage class itself is imported.
TextPage textpage = new TextPage(page, TextPage.e_ParseTextNormal);
// Rectangle to read text from.
RectF selRc = new RectF(100,100,250,250);
String selText = textpage.getTextInRect(selRc);
...

import sys
import site

# Record once whether the interpreter is Python 2; the flag selects which
# SDK binding module is imported below.
_PYTHON2_ = sys.version_info.major == 2

if _PYTHON2_:
    # replace with the python2 lib path
    # The path must be quoted with ASCII quotes; typographic quotes are a syntax error.
    site.addsitedir('../../../')
    from FoxitPDFSDKPython2 import *
else:
    from FoxitPDFSDKPython3 import *
...

# Describe the rectangle to read text from by its four edges.
rect = RectF()
rect.left, rect.right = 90, 450
rect.top, rect.bottom = 595, 580
# Build the text page with the normal parsing flag, then query the rectangle.
textPage = TextPage(page, TextPage.e_ParseTextNormal)
textPage.GetTextInRect(rect)
...

objc

#include "FSPDFObjC.h"
...

// Build the text page with the normal parsing flag.
FSTextPage *textPage = [[FSTextPage alloc] initWithPage:page
                                                  flags:FSTextPageParseTextNormal];
// Rectangle to read text from: left 90, bottom 580, right 450, top 595.
FSRectF *rect = [[FSRectF alloc] initWithLeft1:90
                                       bottom1:580
                                        right1:450
                                          top1:595];
NSString *text = [textPage getTextInRect:rect];

const FSDK = require("@foxitsoftware/foxit-pdf-sdk-node");
...

// Assuming PDFPage page has been loaded and parsed.

// Rectangle to read text from, given by its four edges.
let rect = new FSDK.RectF();
rect.left = 90;
rect.right = 450;
rect.top = 595;
rect.bottom = 580;
let text_page = new FSDK.TextPage(page, FSDK.TextPage.e_ParseTextNormal);
// Call the method on the variable declared above (text_page) and keep the returned text.
let text = text_page.GetTextInRect(rect);
...

csharp

using foxit.common;
using foxit.pdf;
using foxit.common.fxcrt;
...

// Build the text page with the normal parsing flag.
TextPage text_page = new TextPage(page, (int)TextPage.TextParseFlags.e_ParseTextNormal);
// Rectangle to read text from.
RectF rect = new RectF(100, 50, 220, 100);
string str_text = text_page.GetTextInRect(rect);
...

文本页面 (Text Page) ​

从 PDF 页面中提取文本 ​

在 PDF 文档中获取矩形区域中的文本 ​

文本页面 (Text Page)

从 PDF 页面中提取文本

在 PDF 文档中获取矩形区域中的文本