a0af51eb602ba56a44d183661541755ef50be645
[pdfium.git] / core / src / fpdftext / text_int.h
1 // Copyright 2014 PDFium Authors. All rights reserved.\r
2 // Use of this source code is governed by a BSD-style license that can be\r
3 // found in the LICENSE file.\r
4  \r
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com\r
6 \r
7 #ifndef _PDF_TEXT_INT_H_\r
8 #define _PDF_TEXT_INT_H_\r
9 class CPDF_TextParseOptions : public CFX_Object\r
10 {\r
11 public:\r
12     CPDF_TextParseOptions();\r
13     FX_BOOL                     m_bCheckObjectOrder;\r
14     FX_BOOL                     m_bCheckDirection;\r
15     int                         m_nCheckSameObject;\r
16 };\r
// Forward declarations of the text-extraction classes.
class CPDF_TextPage;
class CPDF_LinkExtract;
class CPDF_TextPageFind;
class CPDF_DocProgressiveSearch;

// Character kind codes.
// NOTE(review): presumably the values stored in PAGECHAR_INFO::m_Flag --
// confirm against the implementation.
#define FPDFTEXT_CHAR_ERROR -1
#define FPDFTEXT_CHAR_NORMAL 0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN 3
#define FPDFTEXT_CHAR_PIECE 4

// Marked-content result codes.
// NOTE(review): presumably the return values of
// CPDF_TextPage::PreMarkedContent() -- confirm against the implementation.
#define FPDFTEXT_MC_PASS 0
#define FPDFTEXT_MC_DONE 1
#define FPDFTEXT_MC_DELAY 2
30 typedef struct _PAGECHAR_INFO: public CFX_Object {\r
31     int                                 m_CharCode;\r
32     FX_WCHAR                    m_Unicode;\r
33     FX_FLOAT                    m_OriginX;\r
34     FX_FLOAT                    m_OriginY;\r
35     FX_INT32                    m_Flag;\r
36     CFX_FloatRect               m_CharBox;\r
37     CPDF_TextObject*    m_pTextObj;\r
38     CFX_AffineMatrix    m_Matrix;\r
39     int                                 m_Index;\r
40 } PAGECHAR_INFO;\r
41 typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray;\r
// A run described by a start value and a count.
// NOTE(review): presumably character indices into the page's char list --
// confirm against the users of SEGMENT_Array.
struct FPDF_SEGMENT {
    int m_Start;
    int m_nCount;
};
46 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array;\r
47 typedef struct {\r
48     CPDF_TextObject*    m_pTextObj;\r
49     CFX_AffineMatrix    m_formMatrix;\r
50 } PDFTEXT_Obj;\r
51 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;\r
52 class CPDF_TextPage: public IPDF_TextPage\r
53 {\r
54 public:\r
55     CPDF_TextPage(const CPDF_Page* pPage, int flags = 0);\r
56     CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0);\r
57     CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions);\r
58     virtual FX_BOOL                                     ParseTextPage();\r
59     virtual void                                        NormalizeObjects(FX_BOOL bNormalize);\r
60     virtual     FX_BOOL                                 IsParsered() const\r
61     {\r
62         return m_IsParsered;\r
63     }\r
64     virtual ~CPDF_TextPage() {};\r
65 public:\r
66     virtual int CharIndexFromTextIndex(int TextIndex)const ;\r
67     virtual int TextIndexFromCharIndex(int CharIndex)const;\r
68     virtual int                                         CountChars() const;\r
69     virtual     void                                    GetCharInfo(int index, FPDF_CHAR_INFO & info) const;\r
70     virtual void                                        GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const;\r
71     virtual int                                         GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const;\r
72     virtual int                                         GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance,\r
73             FX_FLOAT yTorelance) const;\r
74     virtual CFX_WideString                      GetTextByRect(CFX_FloatRect rect) const;\r
75     virtual void                                        GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const;\r
76     virtual     int                                             GetOrderByDirection(int order, int direction) const;\r
77     virtual     CFX_WideString                  GetPageText(int start = 0, int nCount = -1) const;\r
78 \r
79     virtual int                                         CountRects(int start, int nCount);\r
80     virtual     void                                    GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top\r
81                                             , FX_FLOAT& right, FX_FLOAT &bottom) const;\r
82     virtual FX_BOOL                                     GetBaselineRotate(int rectIndex, int& Rotate);\r
83     virtual FX_BOOL                                     GetBaselineRotate(CFX_FloatRect rect, int& Rotate);\r
84     virtual     int                                             CountBoundedSegments(FX_FLOAT left, FX_FLOAT top,\r
85             FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE);\r
86     virtual     void                                    GetBoundedSegment(int index, int& start, int& count) const;\r
87     virtual int                                         GetWordBreak(int index, int direction) const;\r
88 public:\r
89     const       PAGECHAR_InfoArray*             GetCharList() const\r
90     {\r
91         return &m_charList;\r
92     }\r
93     static      FX_BOOL                                 IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2);\r
94     static      FX_BOOL                                 IsLetter(FX_WCHAR unicode);\r
95 private:\r
96     FX_BOOL                                                     IsHyphen(FX_WCHAR curChar);\r
97     FX_BOOL                                                     IsControlChar(PAGECHAR_INFO* pCharInfo);\r
98     FX_BOOL                                                     GetBaselineRotate(int start, int end, int& Rotate);\r
99     void                                                        ProcessObject();\r
100     void                                                        ProcessFormObject(CPDF_FormObject*      pFormObj, CFX_AffineMatrix formMatrix);\r
101     void                                                        ProcessTextObject(PDFTEXT_Obj pObj);\r
102     void                                                        ProcessTextObject(CPDF_TextObject*      pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPos);\r
103     int                                                         ProcessInsertObject(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix);\r
104     FX_BOOL                                                     GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);\r
105     FX_BOOL                                                     IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos);\r
106     FX_BOOL                                                     IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);\r
107     int                                                         GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const;\r
108     void                                                        CloseTempLine();\r
109     void                                                        OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str);\r
110     FX_INT32    PreMarkedContent(PDFTEXT_Obj pObj);\r
111     void                ProcessMarkedContent(PDFTEXT_Obj pObj);\r
112     void                CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const;\r
113     void                FindPreviousTextObject(void);\r
114     void                AddCharInfoByLRDirection(CFX_WideString& str, int i);\r
115     void                AddCharInfoByRLDirection(CFX_WideString& str, int i);\r
116     FX_INT32    GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);\r
117     FX_INT32    FindTextlineFlowDirection();\r
118 protected:\r
119     CPDFText_ParseOptions                       m_ParseOptions;\r
120     CFX_WordArray                                       m_CharIndex;\r
121     const CPDF_PageObjects*                     m_pPage;\r
122     PAGECHAR_InfoArray                          m_charList;\r
123     CFX_WideTextBuf                                     m_TextBuf;\r
124     PAGECHAR_InfoArray                          m_TempCharList;\r
125     CFX_WideTextBuf                                     m_TempTextBuf;\r
126     int                                                         m_parserflag;\r
127     CPDF_TextObject*                            m_pPreTextObj;\r
128     CFX_AffineMatrix                            m_perMatrix;\r
129     FX_BOOL                                                     m_IsParsered;\r
130     CFX_AffineMatrix                            m_DisplayMatrix;\r
131 \r
132     SEGMENT_Array                                       m_Segment;\r
133     CFX_RectArray                                       m_SelRects;\r
134     LINEOBJ                                                     m_LineObj;\r
135     FX_BOOL                                                     m_TextlineDir;\r
136     CFX_FloatRect                                       m_CurlineRect;\r
137 };\r
138 class CPDF_TextPageFind: public IPDF_TextPageFind\r
139 {\r
140 public:\r
141     CPDF_TextPageFind(const IPDF_TextPage* pTextPage);\r
142     virtual                                                     ~CPDF_TextPageFind() {};\r
143 public:\r
144     virtual     FX_BOOL                                 FindFirst(CFX_WideString findwhat, int flags, int startPos = 0);\r
145     virtual     FX_BOOL                                 FindNext();\r
146     virtual     FX_BOOL                                 FindPrev();\r
147 \r
148     virtual void                                        GetRectArray(CFX_RectArray& rects) const;\r
149     virtual int                                         GetCurOrder() const;\r
150     virtual int                                         GetMatchedCount()const;\r
151 protected:\r
152     void                                                        ExtractFindWhat(CFX_WideString findwhat);\r
153     FX_BOOL                                                     IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos);\r
154     FX_BOOL                                                     ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString,\r
155             int iSubString, FX_WCHAR chSep);\r
156     CFX_WideString                                      MakeReverse(const CFX_WideString str);\r
157     int                                                         ReverseFind(CFX_WideString csPageText, CFX_WideString csWord, int nStartPos, int& WordLength);\r
158     int                                                         GetCharIndex(int index) const;\r
159 private:\r
160     CFX_WordArray                                       m_CharIndex;\r
161     const IPDF_TextPage*                        m_pTextPage;\r
162     CFX_WideString                                      m_strText;\r
163     CFX_WideString                                      m_findWhat;\r
164     int                                                         m_flags;\r
165     CFX_WideStringArray                         m_csFindWhatArray;\r
166     int                                                         m_findNextStart;\r
167     int                                                         m_findPreStart;\r
168     FX_BOOL                                                     m_bMatchCase;\r
169     FX_BOOL                                                     m_bMatchWholeWord;\r
170     int                                                         m_resStart;\r
171     int                                                         m_resEnd;\r
172     CFX_RectArray                                       m_resArray;\r
173     FX_BOOL                                                     m_IsFind;\r
174 };\r
175 class CPDF_LinkExt: public CFX_Object\r
176 {\r
177 public:\r
178     CPDF_LinkExt() {};\r
179     int                                                         m_Start;\r
180     int                                                         m_Count;\r
181     CFX_WideString                                      m_strUrl;\r
182     virtual                                                     ~CPDF_LinkExt() {};\r
183 };\r
184 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;\r
185 class CPDF_LinkExtract: public IPDF_LinkExtract\r
186 {\r
187 public:\r
188     CPDF_LinkExtract();\r
189     virtual                                                     ~CPDF_LinkExtract();\r
190     virtual FX_BOOL                                     ExtractLinks(const IPDF_TextPage* pTextPage);\r
191     virtual     FX_BOOL                                 IsExtract() const\r
192     {\r
193         return m_IsParserd;\r
194     }\r
195 public:\r
196     virtual int                                         CountLinks() const;\r
197     virtual     CFX_WideString                  GetURL(int index) const;\r
198     virtual     void                                    GetBoundedSegment(int index, int& start, int& count) const;\r
199     virtual     void                                    GetRects(int index, CFX_RectArray& rects)const;\r
200 protected:\r
201     void                                                        parserLink();\r
202     void                                                        DeleteLinkList();\r
203     FX_BOOL                                                     CheckWebLink(CFX_WideString& strBeCheck);\r
204     FX_BOOL                                                     CheckMailLink(CFX_WideString& str);\r
205     FX_BOOL                                                     AppendToLinkList(int start, int count, CFX_WideString strUrl);\r
206 private:\r
207     LINK_InfoArray                                      m_LinkList;\r
208     const CPDF_TextPage*                        m_pTextPage;\r
209     CFX_WideString                                      m_strPageText;\r
210     FX_BOOL                                                     m_IsParserd;\r
211 };\r
212 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst);\r
213 void NormalizeString(CFX_WideString& str);\r
214 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);\r
215 #endif\r