Although I have come up with code that does what I want, I have found that for some RSS feeds it fails by creating a DOM document which is empty. This is a little test app I created to illustrate the issue:
int main()
{
HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
if (hr != S_OK)
return 0;
bool encounteredError = false;
IXMLHTTPRequestPtr pXMLHTTPReq = NULL;
MSXML2::IXMLDOMDocumentPtr pXMLDocPtr = NULL;
MSXML2::IXMLDOMNodeListPtr pItemNodeList = NULL;
MSXML2::IXMLDOMElementPtr pItemElement = NULL;
try
{
// Create an XMLHTTPRequest object to request the feed
hr = pXMLHTTPReq.CreateInstance(__uuidof(MSXML2::XMLHTTP30));
if (FAILED(hr))
throw hr;
// open the request
hr = pXMLHTTPReq->open(_bstr_t(_T("GET")), _bstr_t(DILBERT_RSS_FEED), _variant_t(VARIANT_TRUE));
//hr = pXMLHTTPReq->open(_bstr_t(_T("GET")), _bstr_t(DILBERT_RSS_FEED), _variant_t(VARIANT_FALSE));
if (FAILED(hr))
throw hr;
// Set the headers
hr = pXMLHTTPReq->setRequestHeader(_bstr_t(_T("charset")), _bstr_t(_T("UTF-8")));
if (FAILED(hr))
throw hr;
// Send the request
hr = pXMLHTTPReq->send(NULL);
if (FAILED(hr))
throw hr;
long readyState = READYSTATE_UNINITIALIZED;
MSG msg;
while (readyState != READYSTATE_COMPLETE)
{
// Without this message pump, readyState does not change.
if (PeekMessage(&msg, 0, 0 ,0, PM_REMOVE))
{
TranslateMessage(&msg);
DispatchMessage(&msg);
}
readyState = pXMLHTTPReq->GetreadyState();
}
long nStatus = 0;
hr = pXMLHTTPReq->get_status(&nStatus);
if (FAILED(hr))
throw hr;
// Process the feed if the response was received successfully
if (nStatus == 200)
{
// Retrieve the RSS XML DOM Document to process the RSS
// Feed results and extract the comic strip's images info
// Retrieve the XML DOM Document from the response
BSTR bstrString = NULL;
hr = pXMLHTTPReq->get_responseText(&bstrString);
printf("Response Body:rn%Srn", bstrString);
hr = pXMLHTTPReq->get_responseXML((IDispatch **) &pXMLDocPtr);
if (FAILED(hr))
throw hr;
BSTR bstrXMLDoc = NULL;
pXMLDocPtr->get_text(&bstrXMLDoc);
printf("XML Response:rn%Srn", bstrXMLDoc);
// Retrieve the list of "item" elements
pItemNodeList = pXMLDocPtr->getElementsByTagName(_bstr_t(_T("item")));
if (FAILED(pItemNodeList))
throw hr;
//Here, if we're in error pDomNode is NULL
if (pItemNodeList != NULL)
{
long nItems = 0;
hr = pItemNodeList->get_length(&nItems);
if (FAILED(hr))
throw hr;
for (int i = 0; (i < (int)nItems) && (encounteredError == false); i++)
{
WCHAR rssTitle[TITLE_SIZE];
WCHAR rssLink[URL_SIZE];
WCHAR rssComicURL[URL_SIZE];
WCHAR rssComicFileName[MAX_PATH+1];
// Retrieve the ith item element
pItemElement = pItemNodeList->item[i];
if (pItemElement != NULL)
{
// Retrieve the title text
_tcscpy_s(rssTitle, pItemElement->firstChild->text);
// Retrieve the link element
MSXML2::IXMLDOMNodeListPtr pLinkNodes = NULL;
MSXML2::IXMLDOMElementPtr pLinkElement = NULL;
pLinkNodes = pItemElement->getElementsByTagName(_T("link"));
if (pLinkNodes != NULL)
{
long nLinkElements = 0;
hr = pLinkNodes->get_length(&nLinkElements);
if (FAILED(hr))
throw hr;
if (nLinkElements == 1)
{
pLinkElement = pLinkNodes->item[0];
// Retrieve the link
if (pLinkElement != NULL)
_tcscpy_s(rssLink, pLinkElement->text);
}
}
// Retrieve the description element
MSXML2::IXMLDOMNodeListPtr pSummaryNodes = NULL;
MSXML2::IXMLDOMElementPtr pSummaryElement = NULL;
pSummaryNodes = pItemElement->getElementsByTagName(_bstr_t(_T("description")));
if (pSummaryNodes != NULL)
{
long nSummaryElements = 0;
hr = pSummaryNodes->get_length(&nSummaryElements);
if (FAILED(hr))
throw hr;
if (nSummaryElements == 1)
{
pSummaryElement = pSummaryNodes->item[0];
// Retrieve the description
if (pSummaryElement != NULL)
{
LPSTR szRssSummary;
CHAR szRssComicURL[MAX_PATH+1];
UnicodeToAnsi(pSummaryElement->text, &szRssSummary);
// Retrieve the image URL
const regex imageurl("b(https?|ftp)://([-a-zA-Z0-9.]+)(/[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?(gif|png|jpg)");
cmatch matches;
if (regex_search(szRssSummary, matches, imageurl))
{
strcpy_s(szRssComicURL, matches[0].str().c_str());
#ifdef _UNICODE
LPWSTR pszRssComicURL;
AnsiToUnicode(szRssComicURL, &pszRssComicURL);
_tcscpy_s(rssComicURL, URL_SIZE, pszRssComicURL);
#else
_tcscpy_s(rssComicURL, URL_SIZE, matches[0].str().c_str());
#endif
}
else
{
_tcscpy_s(rssComicURL, URL_SIZE, _T("Not found"));
}
//const regex imageFileName("b(https?|ftp)://([-a-zA-Z0-9.]+)(/[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?(gif|png|jpg)");
const regex imageFileName("[w_.-]*?(?=?)|[w_.-]*$");
if (regex_search(szRssComicURL, matches, imageFileName))
{
#ifdef _UNICODE
LPWSTR pszImageFileName;
AnsiToUnicode(matches[0].str().c_str(), &pszImageFileName);
_tcscpy_s(rssComicFileName, MAX_PATH+1, pszImageFileName);
#else
_tcscpy_s(rssComicFileName, MAX_PATH+1, matches[0].str().c_str());
#endif
}
else
{
_tcscpy_s(rssComicFileName, MAX_PATH+1, _T("Not found"));
}
}
}
}
}
else
{
encounteredError = true;
}
}
encounteredError = false;
}
else
{
encounteredError = true;
}
}
else
{
encounteredError = true;
//cout << "Error selecting XML single node";
}
}
catch(_com_error &e)
{
bool encounteredError = true;
dump_com_error(e);
}
return 0;
} If I run it for http://feedproxy.google.com/DilbertDailyStrip it would retrieve & parse the feed without problems... but for http://www.stickfigurehamlet.com/stickfigurehamlet.rss the call to get_responseXML and then pXMLDocPtr->get_text(&bstrXMLDoc) will result in an empty string... although the call to pXMLHTTPReq->get_responseText(&bstrString) does retrive the response (XML document).
I cannot spot the problem with my code... any ideas?








Sign In »
Register Now!
Help

MultiQuote