1. Setup tools

Install Chrome browser.  Download from here.

Install Web Robots Chrome extension.  Get it here

Sign up on our portal: portal.webrobots.io

 

2. Create a new robot on the portal.

Click ‘Robots’ in the top menu.

Click ‘New robot’ at the bottom of the robots list.

Choose any name and start URL.

In this example we’ll use demo1 as the name and https://stackoverflow.com/questions as the start URL.

 

3.  Hello world robot

Open the Web Robots extension and select your new robot from the drop-down.

Enter code:

// Entry step of the robot: write a greeting to the log, then call done().
steps.start = function () {
    const greeting = 'Hello, world!';

    dbg(greeting);
    done();
};

 

Click “Run” button.

Wait until the status message says the robot is finished:

 Run finished in: 1.075: demo_2015-01-09T09_25_44_727ZDone: 2 Que: 0 EmitQue: 0

See the [Log] tab to view log messages.

 

4.  Save data to server

Modify code to be:

// Entry step of the robot: log a greeting, publish one hard-coded sample
// listing to the 'listings' table, then call done().
steps.start = function () {
    dbg('Hello, world!');

    // A single sample record; emit() is given an array of rows.
    const listing = {
        price: 1,
        address: '100 main street',
        city: 'New York',
        rooms: 5,
        area: 50,
    };

    emit('listings', [listing]);
    done();
};

Run robot and check the [Output] tab.  It shows data we just published:

 
 [
 {
 'price': 1,
 'address': '100 main street',
 'city': 'New York',
 'rooms': 5,
 'area': 50,
 'source_url': 'https://stackoverflow.com/questions'
 }
]

5.  Extract some data from page

Modify the robot code as shown below. This robot will extract every question from a page. Ensure that the robot’s start URL is set to https://stackoverflow.com/questions for this part.

 // Entry step of the robot: build one record per '.s-post-summary' element on
 // the current page, emit the records to the 'questions' table, then call done().
 steps.start = function () {
    const questions = [];

    $('.s-post-summary').each((cardIndex, card) => {
        // Trimmed text of whatever `selector` matches inside this card.
        const textOf = (selector) => $(selector, card).text().trim();

        questions.push({
            title: textOf('.s-post-summary--content-title'),
            url: $('.s-post-summary--content-title a', card).prop('href'),
            votes: textOf('.s-post-summary--stats-item:contains("vote") .s-post-summary--stats-item-number'),
            answers: textOf('.s-post-summary--stats-item:contains("answer") .s-post-summary--stats-item-number'),
            views: textOf('.s-post-summary--stats-item:contains("view") .s-post-summary--stats-item-number'),
            excerpt: textOf('.s-post-summary--content-excerpt'),
            // One trimmed string per tag link, as a plain array.
            tags: $('.s-post-summary--meta-tags a', card)
                .map((tagIndex, tag) => $(tag).text().trim())
                .get(),
            time_ago_asked: textOf('.relativetime'),
            user: {
                username: textOf('.s-user-card--link a'),
                reputation: textOf('.s-user-card--rep'),
            },
        });
    });

    // Nothing is emitted when no question cards were found.
    if (questions.length > 0) {
        emit('questions', questions);
    }
    done();
};

After executing the robot, check that the data appeared on the portal. Find your robot in the portal, open it and see that there is a new run with your data.

6. Iterate over multiple pages

Add code to follow the ‘Next’ link on each page.

 // Entry step of the robot: build one record per '.s-post-summary' element on
 // the current page, pass the "next" pagination link (if any) to next() with
 // this same 'start' step, emit the records to 'questions', then call done().
 steps.start = function () {
    const questions = [];

    $('.s-post-summary').each((cardIndex, card) => {
        // Trimmed text of whatever `selector` matches inside this card.
        const textOf = (selector) => $(selector, card).text().trim();

        questions.push({
            title: textOf('.s-post-summary--content-title'),
            url: $('.s-post-summary--content-title a', card).prop('href'),
            votes: textOf('.s-post-summary--stats-item:contains("vote") .s-post-summary--stats-item-number'),
            answers: textOf('.s-post-summary--stats-item:contains("answer") .s-post-summary--stats-item-number'),
            views: textOf('.s-post-summary--stats-item:contains("view") .s-post-summary--stats-item-number'),
            excerpt: textOf('.s-post-summary--content-excerpt'),
            // One trimmed string per tag link, as a plain array.
            tags: $('.s-post-summary--meta-tags a', card)
                .map((tagIndex, tag) => $(tag).text().trim())
                .get(),
            time_ago_asked: textOf('.relativetime'),
            user: {
                username: textOf('.s-user-card--link a'),
                reputation: textOf('.s-user-card--rep'),
            },
        });
    });

    // Hand the following page's URL to next() before emitting this page's rows.
    const nextLink = $('.s-pagination--item[rel="next"]');
    if (nextLink.length > 0) {
        next(nextLink.prop('href'), 'start');
    }

    // Nothing is emitted when no question cards were found.
    if (questions.length > 0) {
        emit('questions', questions);
    }
    done();
};

Refresh the portal page and you will see a new run with data extracted from all pages.