WLW Robot Examples
Example #1
Start URL
http://4business-werbeartikel.de/onlineshop.html
Robot
// steps.start always gets executed as the initial step on a start URL steps.start = function() { // Use jQuery selector and .each() function to generate next steps for categories. This technique is common in many robots. $(".level2 a").each(function(i, v) { next(v.href, "drillCat"); }); // Must call done when finishing steps. Extension listens for done to start loading a new page. // done() is almost always necessary to finish a step. Exeption: when step causes a new page to load automatically. For example from a click. done(); }; steps.drillCat = function() { // Also use jQuery selector and .each() function to generate next steps. This time next steps will be products scraping. $(".item_list h5 a").each(function(i, v) { next(v.href, "scrapeProduct"); }); // Looking for next page links if there are multiple pages of products. ":first" in the selector ensures that we follow one link. Useful to include because next page usually duplicates at the top and the bottom of the page. $(".pagination a:contains(weiter):first").each(function(i, v) { // Notice here that we are calling the same step function drillCat. This is common across paginated products. next(v.href, "drillCat"); }); // Don't forget done() done(); }; steps.scrapeProduct = function() { // This step executes on a product page. Forming product data from various selectors. var item = { "Internal ID": $("tr:contains(Artikelnummer) td:last").text().trim(), "Categories": getCats(), "Name": $(".product-name").text().trim(), // short desc is the first paragraph "Short Description": $(".product-specs")[0].innerText.trim().split("\n").shift(), "Images": getImgs(), "Long Description": $("#product-attribute-specs-table")[0].innerText.trim(), "Detail URL": document.URL, "Delivery Capacity": "", "Delivery Time": "", "Minimum Order Value": $("#qty").val().trim(), "Payment Option": "", "Price": $(".price-box .price:first").text().replace("€", "").trim(), "Price Comments": "", "Currency": "€" }; // Sending data to portal. 
"1262338" is a table name which coresponds to WLW company ID. // Send item variable as array, even if it is one product. // If there are many products on one page, they should be collected into an array and then sent to portal with a single emit. emit("1262338", [item]); done(); }; // helper function to parse Categories out of breadcrumbs. function getCats() { var cats = ""; $(".breadcrumbs a:not(:first)").each(function(i, v) { if (v.innerText !== "Onlineshop") { cats += v.innerText + ", "; } }); return cats.slice(0, -2); } // helper function to parse image URLs function getImgs() { var imgs = ""; $("#product_addtocart_form img").each(function(i, v) { imgs += v.src + ", "; }); return imgs.slice(0, -2); }Example #2:
Start URL
http://www.3i-merchandising.de/start
Robot
// Initial step: configures retries and queues one "href" step per category link.
steps.start = function() {
    // In this robot we expect some product pages to fail. Changing retry behavior:
    // 3 retries on each step, and up to 10000 retries for the whole run.
    // NOTE(review): the original note said retries happen "every 30 seconds", but the
    // interval passed below is 60*1000 ms (60 seconds) - confirm which one is intended.
    setRetries(60 * 1000, 3, 10000);

    // Drilling into categories using .each() with jQuery
    $(".links_menue").each(function(i, v) {
        next(v.href, "href");
    });

    // Don't forget done()
    done();
};

// Category page step: hands the embedded iframe's URL to the "jump" step.
steps.href = function() {
    // Tricky part. This shop displays products in an iframe, which is like another window inside HTML.
    // We take the iframe's URL and open it as the next step to jump into it.
    var h = $("iframe")[0].src;
    next(h, "jump");
    done();
};

// Runs on the iframe URL and queues "drillCat" for the page the redirect lands on.
steps.jump = function() {
    // Got the iframe URL open and it redirects. Rare example when done() is not needed in a step.
    // done() is triggered for us automatically by a redirect.
    next("", "drillCat");
};

// Listing page step: queues every product on the page, then moves on to the next listing page.
steps.drillCat = function() {
    // As usual generate next steps into each product
    $("div#foto a").each(function(i, v) {
        next(v.href, "scrapeProduct");
    });

    // Tricky part. Here pagination still shows the next page link even when we are on the last listing page.
    // We parse the text of how many out of how many products are displayed to know if we reached the last page.
    // match() returns null when the text holds no digits; fall back to an empty list so both
    // values come out undefined and the step finishes with done() instead of throwing.
    var results = $(".foundtxt:first").text().match(/\d+/g) || []; // extract digits
    var total = results.shift();
    var last = results.pop();

    if (total !== last) {
        // Notice next has a blank URL. This is to indicate that the robot does not have to load a page.
        // See the comment below for why.
        next("", "drillCat");

        // Robot will click on the next page link and it will load a new page.
        // This will also trigger done() for us.
        $("#navigatieVerder").click();
    } else {
        // Do done() only on the last page.
        done();
    }
};
// Product page step: builds one product record, emits it, and always finishes with done().
steps.scrapeProduct = function() {
    // Scraping product data as usual.
    // One tricky part is that in the beginning we check for an element that is present on products.
    // This is to check that we really have a product page.
    if ($("div.titeldetailstop").length > 0) {
        // Text of the details box; the article number and the optional stock line are cut out of it.
        var detailsText = $("div.titeldetails")[0].innerText;

        var item = {
            "Internal ID": detailsText.split("Artikelnr. ").pop().split("\n")[0],
            "Categories": $("#zoekinfotxt b").text().slice(1, -1).trim(),
            "Name": $(".titeldetailstop").text().trim().split("\n")[0].trim(),
            "Short Description": cutDesc($("#productdetailsdescription").text().trim()),
            "Images": $(".imgContainerdetails img")[0].src,
            "Long Description": $("#productdetailsdescription").text().trim(),
            "Detail URL": document.URL,
            "Delivery Capacity": "",
            "Delivery Time": "",
            "Minimum Order Value": "",
            "Payment Option": "",
            "Price": $("div.prijzen div.prijs").text().trim().replace("€ ", ""),
            "Price Comments": $("div.voettxt").text().trim(),
            "Currency": "€"
        };

        // The stock line is optional; everything after its label becomes the delivery capacity.
        if (detailsText.indexOf("Lagerbestand: ") !== -1) {
            item["Delivery Capacity"] = detailsText.split("Lagerbestand: ").pop();
        }

        emit("1607158", [item]);
    }
    done();
};

// Helper function. Builds the short description out of the long description.
// The short description is at most 200 characters of the long description:
// anything longer is cut to its first 197 characters plus a three-character "..." marker.
// Takes the long description string and returns it unchanged when it is 200 characters or fewer.
function cutDesc(desc) {
    var shortDesc = desc;
    if (desc.length > 200) {
        // 197 + "...".length === 200, which keeps the result inside the limit.
        shortDesc = desc.substring(0, 197) + "...";
    }
    return shortDesc;
}