WLW Robot Examples

Example #1

Start URL

http://4business-werbeartikel.de/onlineshop.html

Robot

// steps.start always gets executed as the initial step on a start URL
steps.start = function() {

  // Select every category link with a jQuery selector and queue one "drillCat" step per link through .each().
  // Many robots fan out from the start page this way.
  var categoryLinks = $(".level2 a");
  categoryLinks.each(function(index, link) {
    next(link.href, "drillCat");
  });

  // A step has to end with done(): the extension listens for it before it starts loading a new page.
  // done() is almost always required. Exception: a step that makes a new page load by itself, for example through a click.
  done();
};

steps.drillCat = function() {

  // Builds an .each() callback that queues the visited link's URL for the given step.
  var queueAs = function(stepName) {
    return function(index, link) {
      next(link.href, stepName);
    };
  };

  // Same jQuery selector + .each() technique as on the start page. Here every product link becomes a scraping step.
  $(".item_list h5 a").each(queueAs("scrapeProduct"));

  // Follow the next-page link when the products span several pages. ":first" limits the selection to one link,
  // which matters because the next-page link is usually repeated at the top and the bottom of the page.
  // The queued step is drillCat again - the usual way to walk paginated product lists.
  $(".pagination a:contains(weiter):first").each(queueAs("drillCat"));

  // Don't forget done()
  done();
};

steps.scrapeProduct = function() {

  // This step executes on a product page. The product record is formed from various selectors.
  var item = {
    "Internal ID": $("tr:contains(Artikelnummer) td:last").text().trim(),
    "Categories": getCats(),
    "Name": $(".product-name").text().trim(),
    // Short description is the first paragraph, i.e. the first line of the specs text.
    "Short Description": $(".product-specs")[0].innerText.trim().split("\n").shift(),
    "Images": getImgs(),
    "Long Description": $("#product-attribute-specs-table")[0].innerText.trim(),
    "Detail URL": document.URL,
    "Delivery Capacity": "",
    "Delivery Time": "",
    "Minimum Order Value": $("#qty").val().trim(),
    "Payment Option": "",
    "Price": $(".price-box .price:first").text().replace("€", "").trim(),
    "Price Comments": "",
    "Currency": "€"
  };

  // Sending data to portal. "1262338" is a table name which corresponds to the WLW company ID.
  // Send the item as an array, even if it is a single product.
  // If there are many products on one page, collect them into an array and send them to the portal with a single emit.
  emit("1262338", [item]);

  done();
};

// Helper function to parse categories out of the breadcrumbs.
// Skips the first breadcrumb link and any link labelled "Onlineshop"; returns the remaining labels joined with ", ".
function getCats() {
  var cats = [];
  $(".breadcrumbs a:not(:first)").each(function(i, v) {
    if (v.innerText !== "Onlineshop") {
      cats.push(v.innerText);
    }
  });
  return cats.join(", ");
}

// Helper function to parse image URLs.
// Returns the src of every image inside the add-to-cart form, joined with ", ".
function getImgs() {
  var imgs = [];
  $("#product_addtocart_form img").each(function(i, v) {
    imgs.push(v.src);
  });
  return imgs.join(", ");
}

Example #2

Start URL

http://www.3i-merchandising.de/start

Robot

steps.start = function() {

  // In this robot we expect some product pages to fail, so the retry behavior is changed.
  // Arguments as passed here: the retry interval (60*1000 - presumably milliseconds, i.e. 60 seconds),
  // 3 retries on each step, and up to 10000 retries for the whole run.
  // NOTE(review): this comment previously said "every 30 seconds" while the code passes 60*1000 - confirm the intended interval.
  setRetries(60*1000, 3, 10000);

  // Drilling into categories using .each() with jQuery
  $('.links_menue').each(function(i, v) {
    next(v.href, "href");
  });

  // don't forget done()
  done();
};

steps.href = function() {

  // Tricky part. This shop displays products in an iframe, which is like another window inside the HTML.
  // We take the URL of the first iframe on the page and open it as the next step to jump into it.
  var h = $('iframe')[0].src;
  next(h, "jump");
  done();
};

steps.jump = function() {

  // The iframe URL is now open and it redirects. Rare example of a step where done() is not needed:
  // done() is triggered for us automatically by the redirect.
  // The blank URL means the robot does not load a page itself for the queued drillCat step.
  next("", "drillCat");
};

steps.drillCat = function() {

  // As usual generate next steps into each product
  $("div#foto a").each(function(i, v) {
    next(v.href, "scrapeProduct");
  });

  // Tricky part. Here pagination still shows a next-page link even when we are on the last listing page.
  // We parse the text that says how many out of how many products are displayed to know if we reached the last page.
  // The first number in the text is taken as the total and the last number as the last product shown.
  // Note: match() returns null when the text contains no digits, so this assumes the counter text is always present.
  var results = $(".foundtxt:first").text().match(/\d+/g); // extract digits
  var total = results.shift();
  var last = results.pop();

  if (total !== last) {

    // Notice next has a blank URL. This is to indicate that the robot does not have to load a page. See the comment below for why.
    next("", "drillCat");

    // Robot will click on the next-page link and it will load a new page. This will also trigger done() for us.
    $("#navigatieVerder").click();
  } else {

    // Do done() only on the last page.
    done();
  }

};

steps.scrapeProduct = function() {

  // Scraping product data as usual.
  // One tricky part: at the beginning we check for an element that is present on product pages.
  // This is to check that we really have a product page; otherwise nothing is emitted.

  if ($("div.titeldetailstop").length > 0) {
    var item = {
      "Internal ID" : $("div.titeldetails")[0].innerText.split("Artikelnr. ").pop().split("\n")[0],
      "Categories" : $("#zoekinfotxt b").text().slice(1, -1).trim(),
      "Name" : $(".titeldetailstop").text().trim().split("\n")[0].trim(),
      "Short Description" : cutDesc($("#productdetailsdescription").text().trim()),
      "Images" : $(".imgContainerdetails img")[0].src,
      "Long Description" : $("#productdetailsdescription").text().trim(),
      "Detail URL" : document.URL,
      "Delivery Capacity" : "",
      "Delivery Time" : "",
      "Minimum Order Value" : "",
      "Payment Option" : "",
      "Price" : $("div.prijzen div.prijs").text().trim().replace("€ ", ""),
      "Price Comments" : $("div.voettxt").text().trim(),
      "Currency" : "€"
    };

    // Delivery capacity is filled in only when the details text contains a "Lagerbestand: " entry.
    if ($("div.titeldetails")[0].innerText.indexOf("Lagerbestand: ") !== -1) {
      item["Delivery Capacity"] = $("div.titeldetails")[0].innerText.split("Lagerbestand: ").pop();
    }

    emit("1607158", [item]);
  }

  // done() is called whether or not a product was found on the page.
  done();
};

// Helper function. Builds the short description out of the long description.
// A description of up to 200 characters is returned unchanged; a longer one is cut to
// its first 197 characters followed by a single "…" character.
// Declared as a function (like the helpers in Example #1) instead of assigning to an undeclared name.
function cutDesc(desc) {
  if (desc.length > 200) {
    return desc.substring(0, 197) + "…";
  }
  return desc;
}