1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
7 #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_
8 #define CORE_SRC_FPDFTEXT_TEXT_INT_H_
10 #include "../../include/fpdftext/fpdf_text.h"
11 #include "../../include/fxcrt/fx_basic.h"
// Forward declarations. CPDF_TextPageFind and CPDF_LinkExtract are defined
// later in this header; CPDF_FormObject is only used by pointer in
// CPDF_TextPage::ProcessFormObject().
14 class CPDF_DocProgressiveSearch;
15 class CPDF_FormObject;
16 class CPDF_LinkExtract;
17 class CPDF_TextPageFind;
// FPDFTEXT_CHAR_* constants.
// NOTE(review): presumably the possible values of a per-character "kind" flag;
// the field that stores them is not visible in this excerpt -- confirm.
19 #define FPDFTEXT_CHAR_ERROR -1
20 #define FPDFTEXT_CHAR_NORMAL 0
21 #define FPDFTEXT_CHAR_GENERATED 1
22 #define FPDFTEXT_CHAR_UNUNICODE 2
23 #define FPDFTEXT_CHAR_HYPHEN 3
24 #define FPDFTEXT_CHAR_PIECE 4
// FPDFTEXT_MC_* constants.
// NOTE(review): presumably status codes for marked-content handling (see
// CPDF_TextPage::PreMarkedContent(), which returns int32_t) -- confirm against
// the implementation file.
25 #define FPDFTEXT_MC_PASS 0
26 #define FPDFTEXT_MC_DONE 1
27 #define FPDFTEXT_MC_DELAY 2
// Per-character record.
// NOTE(review): only three members are visible in this excerpt, and the
// closing "} PAGECHAR_INFO;" of the typedef is not shown -- confirm the full
// member list against the complete header.
29 typedef struct _PAGECHAR_INFO {
35 CFX_FloatRect m_CharBox;
36 CPDF_TextObject* m_pTextObj;
37 CFX_AffineMatrix m_Matrix;
// Segmented array of PAGECHAR_INFO records; CPDF_TextPage uses this type for
// m_charList and m_TempCharList.
40 typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray;
// Array of FPDF_SEGMENT values; CPDF_TextPage uses this type for m_Segment.
45 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array;
// NOTE(review): these two members presumably belong to PDFTEXT_Obj (the
// element type of LINEOBJ below); the struct's opening and closing lines are
// not visible in this excerpt -- confirm.
47 CPDF_TextObject* m_pTextObj;
48 CFX_AffineMatrix m_formMatrix;
// Array of PDFTEXT_Obj values.
50 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;
// Concrete implementation of the IPDF_TextPage interface.
// Constructed from a page pointer and an integer flags value; the members
// declared "override" below implement the interface, the remaining member
// functions are helpers declared for this class only.
// NOTE(review): access specifiers and some parameter lines of the multi-line
// declarations are not visible in this excerpt.
52 class CPDF_TextPage : public IPDF_TextPage {
54 CPDF_TextPage(const CPDF_Page* pPage, int flags);
55 ~CPDF_TextPage() override {}
// IPDF_TextPage overrides.
58 FX_BOOL ParseTextPage() override;
59 void NormalizeObjects(FX_BOOL bNormalize) override;
// Returns the m_bIsParsed member.
60 bool IsParsed() const override { return m_bIsParsed; }
61 int CharIndexFromTextIndex(int TextIndex) const override;
62 int TextIndexFromCharIndex(int CharIndex) const override;
63 int CountChars() const override;
64 void GetCharInfo(int index, FPDF_CHAR_INFO& info) const override;
65 void GetRectArray(int start,
67 CFX_RectArray& rectArray) const override;
// Two overloads: one takes the position as a CPDF_Point, the other starts
// with a separate FX_FLOAT x coordinate.
68 int GetIndexAtPos(CPDF_Point point,
70 FX_FLOAT yTolerance) const override;
71 int GetIndexAtPos(FX_FLOAT x,
74 FX_FLOAT yTolerance) const override;
75 CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override;
76 void GetRectsArrayByRect(const CFX_FloatRect& rect,
77 CFX_RectArray& resRectArray) const override;
// Defaults: start = 0, nCount = -1.
// NOTE(review): -1 presumably means "to the end of the page" -- confirm.
78 CFX_WideString GetPageText(int start = 0, int nCount = -1) const override;
// Not const, unlike most of the query methods above.
79 int CountRects(int start, int nCount) override;
80 void GetRect(int rectIndex,
84 FX_FLOAT& bottom) const override;
// Both overloads write the result through the Rotate out-parameter.
85 FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override;
86 FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override;
87 int CountBoundedSegments(FX_FLOAT left,
91 FX_BOOL bContains = FALSE) override;
92 void GetBoundedSegment(int index, int& start, int& count) const override;
93 int GetWordBreak(int index, int direction) const override;
// Non-interface accessors and static helpers.
// GetCharList() returns a pointer to the internal m_charList; the pointer is
// to a member of this object, so it is only usable while the object exists.
95 const PAGECHAR_InfoArray* GetCharList() const { return &m_charList; }
96 static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1,
97 const CFX_FloatRect& rect2);
98 static FX_BOOL IsLetter(FX_WCHAR unicode);
// Internal helpers used while building the character list.
// NOTE(review): presumably private; the access specifier line is not visible.
101 FX_BOOL IsHyphen(FX_WCHAR curChar);
102 bool IsControlChar(const PAGECHAR_INFO& charInfo);
// Third GetBaselineRotate overload, taking a start/end index pair.
103 FX_BOOL GetBaselineRotate(int start, int end, int& Rotate);
104 void ProcessObject();
105 void ProcessFormObject(CPDF_FormObject* pFormObj,
106 const CFX_AffineMatrix& formMatrix);
// Two ProcessTextObject overloads: one takes a PDFTEXT_Obj by value, the other
// a text object pointer plus form matrix (its last parameter line is not
// visible in this excerpt).
107 void ProcessTextObject(PDFTEXT_Obj pObj);
108 void ProcessTextObject(CPDF_TextObject* pTextObj,
109 const CFX_AffineMatrix& formMatrix,
111 int ProcessInsertObject(const CPDF_TextObject* pObj,
112 const CFX_AffineMatrix& formMatrix);
113 FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
114 FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos);
115 FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1,
116 CPDF_TextObject* pTextObj2);
117 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const;
118 void CloseTempLine();
119 void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str);
// NOTE(review): the int32_t result presumably is one of the FPDFTEXT_MC_*
// values defined at the top of this header -- confirm.
120 int32_t PreMarkedContent(PDFTEXT_Obj pObj);
121 void ProcessMarkedContent(PDFTEXT_Obj pObj);
122 void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
123 void FindPreviousTextObject(void);
124 void AddCharInfoByLRDirection(CFX_WideString& str, int i);
125 void AddCharInfoByRLDirection(CFX_WideString& str, int i);
126 int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
127 int32_t FindTextlineFlowDirection();
128 void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
129 FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj,
130 const CPDF_Font* pFont,
// Data members.
133 CPDFText_ParseOptions m_ParseOptions;
134 CFX_WordArray m_CharIndex;
// Const pointer to const page objects: neither the pointer nor the pointee can
// be modified through this member.
135 const CPDF_PageObjects* const m_pPage;
// m_charList / m_TextBuf have m_Temp* counterparts of the same types.
// NOTE(review): the temporaries presumably hold the line being assembled
// before CloseTempLine() -- confirm in the implementation.
136 PAGECHAR_InfoArray m_charList;
137 CFX_WideTextBuf m_TextBuf;
138 PAGECHAR_InfoArray m_TempCharList;
139 CFX_WideTextBuf m_TempTextBuf;
// Const, so it must be set in the constructor's initializer list.
140 const int m_parserflag;
141 CPDF_TextObject* m_pPreTextObj;
142 CFX_AffineMatrix m_perMatrix;
144 CFX_AffineMatrix m_DisplayMatrix;
145 SEGMENT_Array m_Segment;
146 CFX_RectArray m_SelRects;
148 int32_t m_TextlineDir;
149 CFX_FloatRect m_CurlineRect;
// Concrete implementation of the IPDF_TextPageFind interface, constructed
// from a pointer to the IPDF_TextPage it operates on (stored in m_pTextPage).
// NOTE(review): access specifiers and some parameter lines of the multi-line
// declarations are not visible in this excerpt.
152 class CPDF_TextPageFind : public IPDF_TextPageFind {
// explicit: prevents implicit conversion from an IPDF_TextPage pointer.
154 explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
155 ~CPDF_TextPageFind() override {}
// IPDF_TextPageFind overrides.
// FindFirst's startPos defaults to 0.
158 FX_BOOL FindFirst(const CFX_WideString& findwhat,
160 int startPos = 0) override;
161 FX_BOOL FindNext() override;
162 FX_BOOL FindPrev() override;
163 void GetRectArray(CFX_RectArray& rects) const override;
164 int GetCurOrder() const override;
165 int GetMatchedCount() const override;
// Internal helpers.
// NOTE(review): presumably protected/private; the access specifier line is
// not visible.
168 void ExtractFindWhat(const CFX_WideString& findwhat);
169 FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText,
172 FX_BOOL ExtractSubString(CFX_WideString& rString,
173 const FX_WCHAR* lpszFullString,
176 CFX_WideString MakeReverse(const CFX_WideString& str);
177 int ReverseFind(const CFX_WideString& csPageText,
178 const CFX_WideString& csWord,
181 int GetCharIndex(int index) const;
// Data members.
184 CFX_WordArray m_CharIndex;
// Text page passed to the constructor; held as a raw pointer to const.
185 const IPDF_TextPage* m_pTextPage;
186 CFX_WideString m_strText;
187 CFX_WideString m_findWhat;
189 CFX_WideStringArray m_csFindWhatArray;
192 FX_BOOL m_bMatchCase;
193 FX_BOOL m_bMatchWholeWord;
196 CFX_RectArray m_resArray;
// NOTE(review): the two members below appear to belong to CPDF_LinkExt (the
// destructor names it); the class's opening and closing lines are not visible
// in this excerpt -- confirm.
205 CFX_WideString m_strUrl;
206 virtual ~CPDF_LinkExt() {}
// Array of CPDF_LinkExt pointers; CPDF_LinkExtract uses this type for
// m_LinkList.
209 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;
// Concrete implementation of the IPDF_LinkExtract interface.
// NOTE(review): the constructor, access specifiers and the m_bIsParsed
// declaration are not visible in this excerpt.
211 class CPDF_LinkExtract : public IPDF_LinkExtract {
// Declared here, defined out of line.
// NOTE(review): m_LinkList stores raw CPDF_LinkExt pointers, so the
// destructor presumably releases them (see DeleteLinkList) -- confirm.
214 ~CPDF_LinkExtract() override;
// IPDF_LinkExtract overrides.
217 FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override;
218 int CountLinks() const override;
219 CFX_WideString GetURL(int index) const override;
220 void GetBoundedSegment(int index, int& start, int& count) const override;
221 void GetRects(int index, CFX_RectArray& rects) const override;
// Non-interface accessor: returns the m_bIsParsed member.
223 FX_BOOL IsExtract() const { return m_bIsParsed; }
// Internal helpers.
227 void DeleteLinkList();
// Both checks take the string by non-const reference.
// NOTE(review): presumably they may rewrite it in place -- confirm.
228 FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
229 FX_BOOL CheckMailLink(CFX_WideString& str);
230 void AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
// Data members.
233 LINK_InfoArray m_LinkList;
// Held as the concrete CPDF_TextPage type, although ExtractLinks() receives an
// IPDF_TextPage pointer.
234 const CPDF_TextPage* m_pTextPage;
235 CFX_WideString m_strPageText;
// Declarations of free helper functions; their definitions are not part of
// this header excerpt.
// FX_Unicode_GetNormalization writes through pDst and returns an FX_STRSIZE.
// NOTE(review): presumably the number of characters written -- confirm.
239 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst);
// Takes the string by non-const reference (in/out parameter).
240 void NormalizeString(CFX_WideString& str);
// Result is delivered through the sDest out-parameter.
241 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);
// NOTE(review): one parameter line between pPage and pObjArray is not visible
// in this excerpt.
242 void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
243 CPDF_PageObjects* pPage,
245 CFX_PtrArray* pObjArray);
247 #endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_