文本页面 (Text Page)
福昕 PDF SDK 提供 APIs 来提取、选择、搜索和检索 PDF 文档中的文本。PDF 文本内容存储在与特定页面相关的 TextPage 对象中。TextPage 类可用于获取 PDF 页面中文本的信息,例如单个字符、单个单词、指定字符范围或矩形内的文本内容等。它还可用于构造其他文本相关类的对象,用来对文本内容执行更多操作或从文本内容访问指定信息(以 Java 为例):
- 在 PDF 页面的文本内容中搜索文本,使用 TextPage 对象来构建 TextSearch 对象。
- 访问类似超文本链接的文本,使用 TextPage 对象来构建 PageTextLinks 对象。
从 PDF 页面中提取文本
c++
#include "include/common/fs_common.h"
#include "include/pdf/fs_pdfdoc.h"
#include "include/pdf/fs_search.h"
using namespace std;
using namespace foxit;
using namespace foxit::common;
using foxit::common::Library;
using namespace pdf;
...
// Assuming PDFPage page has been loaded and parsed.
// Get the text page object.
TextPage text_page(page);
int count = text_page.GetCharCount();
if (count > 0) {
WString text = text_page.GetChars();
String s_text = text.UTF8Encode();
fwrite((const char*)s_text, sizeof(char), s_text.GetLength(), file);
}
...
C
#include "include/fs_basictypes_c.h"
#include "include/fs_common_c.h"
#include "include/fs_pdfdoc_c.h"
#include "include/fs_search_c.h"
...
// Precondition: `page` is an FS_PDFPAGE_HANDLE that has already been loaded
// and parsed, and `file` is an open FILE* to write to.
// Create the text page handle for `page`.
FS_TEXTPAGE_HANDLE text_page_handle;
FSDK_TextPage_Create(page, e_FSTextParseFlagsParseTextNormal, &text_page_handle);
int char_total;
FSDK_TextPage_GetCharCount(text_page_handle, &char_total);
if (char_total > 0) {
  // Request characters starting at index 0 with a count of -1.
  FS_WSTR wide_text;
  FSDK_TextPage_GetChars(text_page_handle, 0, -1, &wide_text);
  // UTF-8 encode the wide string so it can be written as bytes.
  FS_BSTR utf8_text;
  FSErrorCode encode_status = FSDK_WStr_UTF8Encode(wide_text, &utf8_text);
  fwrite(utf8_text.str, sizeof(char), utf8_text.len, file);
  FSDK_WStr_Clear(wide_text);
  // NOTE(review): only the wide string is cleared here; confirm whether
  // utf8_text also needs to be released by the caller.
}
...
java
import com.foxit.sdk.pdf.PDFDoc;
import com.foxit.sdk.pdf.TextPage;
...
// Assuming PDFPage page has been loaded and parsed.
// Get the text page object. The parse flag is qualified with the TextPage
// class name because this snippet has no static import for it.
TextPage textpage = new TextPage(page, TextPage.e_ParseTextNormal);
int nCharCount = textpage.getCharCount();
// Read nCharCount characters starting at index 0.
String texts = textpage.getChars(0, nCharCount);
...
py
import sys
import site

# Select the SDK binding that matches the running interpreter's major version.
_PYTHON2_ = sys.version_info.major == 2

if _PYTHON2_:
    # replace with the python2 lib path
    site.addsitedir('../../../')
    from FoxitPDFSDKPython2 import *
else:
    from FoxitPDFSDKPython3 import *
...
# Assuming PDFPage page has been loaded and parsed, and `file` is a file
# object opened for binary writing.
# Get the text page object.
text_page = TextPage(page)
count = text_page.GetCharCount()
if count > 0:
    text = text_page.GetChars()
    if _PYTHON2_:
        file.write(text)
    else:
        # Under Python 3 the text is encoded to UTF-8 bytes before writing.
        file.write(bytes(text, encoding="utf-8"))
...
objc
#include "FSPDFObjC.h"
...
// Precondition: `page` is an FSPDFPage that has already been loaded and parsed.
// Build the text page for it with the normal text-parsing flag.
FSTextPage *pageText = [[FSTextPage alloc] initWithPage:page flags:FSTextPageParseTextNormal];
int totalChars = [pageText getCharCount];
if (totalChars > 0) {
    // Request characters starting at index 0 with a count of -1.
    NSString *extractedText = [pageText getChars:0 count:-1];
}
...
js
const FSDK = require("@foxitsoftware/foxit-pdf-sdk-node");
const fs = require('fs');
...
// Assuming PDFPage page has been loaded and parsed, and `file` is an open
// file descriptor.
// Get the text page object.
let text_page = new FSDK.TextPage(page, FSDK.TextPage.e_ParseTextNormal);
let count = text_page.GetCharCount();
if (count > 0) {
  // Request characters starting at index 0 with a count of -1.
  let text = text_page.GetChars(0, -1);
  // Buffer.from() is a static factory, so it is called without `new`.
  let buffer = Buffer.from(text, 'utf-8');
  fs.writeSync(file, buffer, 0, buffer.length);
}
...
csharp
using foxit.common;
using foxit.pdf;
...
// Precondition: `page` is a PDFPage that has already been loaded and parsed,
// and `writer` is an open text writer.
// The using statement disposes the text page when the block exits.
using (TextPage pageText = new TextPage(page, (int)TextPage.TextParseFlags.e_ParseTextNormal))
{
    int charTotal = pageText.GetCharCount();
    if (charTotal > 0)
    {
        // Read charTotal characters starting at index 0 and write them out.
        String extracted = pageText.GetChars(0, charTotal);
        writer.Write(extracted);
    }
}
...
在 PDF 文档中获取矩形区域中的文本
c++
#include "include/common/fs_common.h"
#include "include/pdf/fs_pdfdoc.h"
#include "include/pdf/fs_search.h"
using namespace foxit;
using namespace foxit::common;
using foxit::common::Library;
using namespace pdf;
...
RectF rect;
rect.left = 90;
rect.right = 450;
rect.top = 595;
rect.bottom = 580;
TextPage textPage = new TextPage (&page, TextPage::e_ParseTextNormal);
textPage.GetTextInRect(&rect);
...
C
#include "include/fs_basictypes_c.h"
#include "include/fs_common_c.h"
#include "include/fs_pdfdoc_c.h"
#include "include/fs_search_c.h"
...
// Assuming FS_PDFPAGE_HANDLE page has been loaded and parsed.
// Rectangle whose text should be retrieved.
FSRectF rect;
rect.left = 90;
rect.right = 450;
rect.top = 595;
rect.bottom = 580;
// Create the text page handle for `page`.
FS_TEXTPAGE_HANDLE text_page;
FSDK_TextPage_Create(page, e_FSTextParseFlagsParseTextNormal, &text_page);
FS_WSTR text;
// Use the handle created above (`text_page`); the result is written to `text`.
FSDK_TextPage_GetTextInRect(text_page, rect, &text);
...
java
import com.foxit.sdk.pdf.PDFDoc;
import com.foxit.sdk.pdf.TextPage;
import com.foxit.sdk.common.fxcrt.RectF;
import com.foxit.sdk.common.fxcrt.RectFArray;
...
// Assuming PDFPage page has been loaded and parsed.
...
// The parse flag is qualified with the TextPage class name because this
// snippet has no static import for it.
TextPage textpage = new TextPage(page, TextPage.e_ParseTextNormal);
// Rectangle whose text should be retrieved.
RectF selRc = new RectF(100,100,250,250);
String selText = textpage.getTextInRect(selRc);
...
py
import sys
import site

# Select the SDK binding that matches the running interpreter's major version.
_PYTHON2_ = sys.version_info.major == 2

if _PYTHON2_:
    # replace with the python2 lib path
    site.addsitedir('../../../')
    from FoxitPDFSDKPython2 import *
else:
    from FoxitPDFSDKPython3 import *
...
# Assuming PDFPage page has been loaded and parsed.
# Rectangle whose text should be retrieved.
rect = RectF()
rect.left = 90
rect.right = 450
rect.top = 595
rect.bottom = 580
textPage = TextPage(page, TextPage.e_ParseTextNormal)
# Keep the returned text instead of discarding it.
text = textPage.GetTextInRect(rect)
...
objc
#include "FSPDFObjC.h"
...
// Build the text page for `page` with the normal text-parsing flag.
FSTextPage *pageText = [[FSTextPage alloc] initWithPage:page flags:FSTextPageParseTextNormal];
// Query rectangle: left 90, bottom 580, right 450, top 595.
FSRectF *queryRect = [[FSRectF alloc] initWithLeft1:90 bottom1:580 right1:450 top1:595];
NSString *textInRect = [pageText getTextInRect:queryRect];
js
const FSDK = require("@foxitsoftware/foxit-pdf-sdk-node");
...
// Assuming PDFPage page has been loaded and parsed.
// Rectangle whose text should be retrieved.
let rect = new FSDK.RectF();
rect.left = 90;
rect.right = 450;
rect.top = 595;
rect.bottom = 580;
let text_page = new FSDK.TextPage(page, FSDK.TextPage.e_ParseTextNormal);
// Call the method on the variable declared above (`text_page`) and keep the result.
let text = text_page.GetTextInRect(rect);
...
csharp
using foxit.common;
using foxit.pdf;
using foxit.common.fxcrt;
...
// Assuming PDFPage page has been loaded and parsed.
// Query rectangle built from the four values 100, 50, 220, 100.
RectF queryRect = new RectF(100, 50, 220, 100);
TextPage pageText = new TextPage(page, (int)TextPage.TextParseFlags.e_ParseTextNormal);
String textInRect = pageText.GetTextInRect(queryRect);
...