C language > Work in progress
Word .doc file, extract text
TimoVJL:
A simple example that just extracts the text portion from a Word .doc file, with no formatting or other processing.
bitcoin:
It's perfect! Thank you very much! This is very hard code - I haven't seen any .doc file parsers before.
TimoVJL:
Not very useful code; it doesn't even handle the piece table :(
Links:
Office Binary (doc, xls, ppt) Translator to Open XML
EDIT Test program using some OLE functions:
--- Code: ---#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <ole2.h>
#include <stdio.h>
#pragma comment(lib, "ole32.lib")
int __cdecl main(int argc, char **argv)
{
LPSTORAGE lpStorage;
BYTE szTmp[512];
MultiByteToWideChar(CP_OEMCP, 0, argv[1], -1, (WCHAR*)szTmp, 260);
SCODE sc = StgOpenStorage((WCHAR*)szTmp, NULL, STGM_READ | STGM_SHARE_EXCLUSIVE, 0, 0, &lpStorage);
if (sc == NOERROR)
{
LPSTREAM lpStream;
sc = lpStorage->lpVtbl->OpenStream(lpStorage, L"WordDocument", NULL, STGM_READ | STGM_SHARE_EXCLUSIVE, 0, (void*)&lpStream);
if (lpStream) {
STATSTG statsg;
DWORD nRead;
puts("WordDocument");
lpStream->lpVtbl->Stat(lpStream, &statsg, STATFLAG_NONAME);
LARGE_INTEGER li = {0};
lpStream->lpVtbl->Seek(lpStream, li, STREAM_SEEK_SET, NULL);
lpStream->lpVtbl->Read(lpStream, &szTmp, 32, &nRead);
if (*(WORD*)szTmp == 0xA5EC || *(WORD*)szTmp == 0xA5DC) { // Word.8 Word.6
DWORD nTxOfs1 = *(DWORD*)(szTmp+0x18);
DWORD nTxOfs2 = *(DWORD*)(szTmp+0x1C);
printf("text starts: %Xh\n", *(DWORD*)(szTmp+0x18));
printf("text ends: %Xh\n", *(DWORD*)(szTmp+0x1C));
strcpy(szTmp, argv[1]);
strcat(szTmp, ".txt");
HANDLE hFileTxt = CreateFile(szTmp, GENERIC_WRITE, 0, NULL,
CREATE_ALWAYS ,FILE_FLAG_SEQUENTIAL_SCAN, NULL);
DWORD nSize = statsg.cbSize.u.LowPart;
DWORD nWrite;
li.u.LowPart = nTxOfs1; // start of text (incremental saving)
lpStream->lpVtbl->Seek(lpStream, li, STREAM_SEEK_SET, NULL);
nSize = nTxOfs2 - nTxOfs1; // saving area
while (nSize) {
nRead = nSize > 512 ? 512 : nSize;
lpStream->lpVtbl->Read(lpStream, &szTmp, nRead, &nRead);
WriteFile(hFileTxt, szTmp, nRead, &nWrite, NULL);
nSize -= nRead;
}
CloseHandle(hFileTxt);
}
lpStream->lpVtbl->Release(lpStream);
}
lpStorage->lpVtbl->Release(lpStorage);
}
return 0;
}
--- End code ---
bitcoin:
TimoVJL
I often need to find some words in a lot of documents. Your code, I think, will be very useful.
jj2007:
--- Quote from: TimoVJL on March 26, 2019, 06:28:05 PM ---EDIT Test program using some OLE functions
--- End quote ---
Works like a charm, Timo :)
I have MS Word installed; would it work without?
Navigation
[0] Message Index
[#] Next page
Go to full version