Reduce Side Join in Map Reduce

Reduce Side Joins

Problem: There are two files: one contains the City-to-Airlines mapping, the other contains the Country-to-City mapping. The job is expected to output the Country-to-Airlines mapping. (Github)

  1.  Country can have many cities
  2. City can have multiple airlines

 

dependency

<dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-cdh5.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.6.0-cdh5.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0-cdh5.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.6.0-cdh5.9.0</version>
        </dependency>
   </dependencies>

   <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
   </repositories> 

[addToAppearHere]

Mapper1 :   Reads the input for City To Airlines

 

package com.big.data.mapreduce.join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper for the "city,airline" input of the reduce-side join.
 *
 * <p>The driver registers this mapper with {@code TextInputFormat}, so the input key is the
 * byte offset of the line in the file and the input value is the line itself. For every
 * well-formed line it emits {@code (city, "AL_" + airline)}; the prefix lets the reducer tell
 * airline values apart from the country values emitted by the other mapper for the same city.
 */
public class CityToAirlinesMapper extends Mapper<LongWritable, Text, Text, Text> {

    /** Prefix put in front of every airline value so the reducer can recognise it. */
    public static final String AIRLINE_DELIMTER = "AL_";

    // Counter used to report lines that could not be parsed instead of failing the task.
    private static final String COUNTER_GROUP = "CityToAirlinesMapper";
    private static final String MALFORMED_RECORDS = "MALFORMED_RECORDS";

    // Reused across map() calls to avoid allocating two Text objects per record.
    private Text city;
    private Text airlines;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);

        city = new Text();
        airlines = new Text();
    }

    /**
     * Parses one "city,airline" line and emits the city as key and the tagged airline as value.
     *
     * @param key     offset of the line in the input file (unused)
     * @param value   the raw input line
     * @param context task context used to emit the pair
     */
    @Override
    public void map(LongWritable key, Text value,
                    Context context) throws IOException, InterruptedException {

        String[] cityToAirlines = value.toString().split(CountryToAirlineDriver.DELIMTER);

        // A blank line, or a line without the delimiter / without an airline, yields fewer
        // than two fields. Indexing [1] would throw ArrayIndexOutOfBoundsException and fail
        // the whole task, so such lines are counted and skipped instead.
        if (cityToAirlines.length < 2) {
            context.getCounter(COUNTER_GROUP, MALFORMED_RECORDS).increment(1);
            return;
        }

        city.set(cityToAirlines[0]);
        // Prefix added to the value to distinguish it from a country in the reducer
        airlines.set(AIRLINE_DELIMTER + cityToAirlines[1]);
        // City is the key, airline is the value
        context.write(city, airlines);
    }

}
[addToAppearHere]

Mapper2 : Reads the input for Country To City mapping

 

package com.big.data.mapreduce.join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper for the "country,city" input of the reduce-side join.
 *
 * <p>The driver registers this mapper with {@code TextInputFormat}, so the input key is the
 * byte offset of the line in the file and the input value is the line itself. For every
 * well-formed line it emits {@code (city, "CO_" + country)}: the city becomes the join key and
 * the prefix lets the reducer tell country values apart from the airline values emitted by
 * the other mapper for the same city.
 */
public class CountryToCityMapper extends Mapper<LongWritable, Text, Text, Text> {

    /** Prefix put in front of every country value so the reducer can recognise it. */
    public static final String COUNTRY_DELIMTER = "CO_";

    // Counter used to report lines that could not be parsed instead of failing the task.
    private static final String COUNTER_GROUP = "CountryToCityMapper";
    private static final String MALFORMED_RECORDS = "MALFORMED_RECORDS";

    // Reused across map() calls to avoid allocating two Text objects per record.
    private Text country;
    private Text city;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);

        country = new Text();
        city = new Text();
    }

    /**
     * Parses one "country,city" line and emits the city as key and the tagged country as value.
     *
     * @param key     offset of the line in the input file (unused)
     * @param value   the raw input line
     * @param context task context used to emit the pair
     */
    @Override
    public void map(LongWritable key, Text value,
                    Context context) throws IOException, InterruptedException {

        String[] countryToCity = value.toString().split(CountryToAirlineDriver.DELIMTER);

        // A blank line, or a line without the delimiter / without a city, yields fewer than
        // two fields. Indexing [1] would throw ArrayIndexOutOfBoundsException and fail the
        // whole task, so such lines are counted and skipped instead.
        if (countryToCity.length < 2) {
            context.getCounter(COUNTER_GROUP, MALFORMED_RECORDS).increment(1);
            return;
        }

        city.set(countryToCity[1]);
        // Prefix added to the value to distinguish it from an airline in the reducer
        country.set(COUNTRY_DELIMTER + countryToCity[0]);

        // City is the key, country is the value
        context.write(city, country);
    }

}

Reducer :
City is the key; the country is the value coming from one mapper, and the airline is the value coming from the other mapper.

package com.big.data.mapreduce.join;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * Reducer of the reduce-side join: the key is a city, the values are a mix of tagged
 * countries ("CO_" prefix) and tagged airlines ("AL_" prefix) emitted by the two mappers.
 *
 * <p>For each city the values are separated by their prefix and every (country, airline)
 * combination is written out. A city that has only countries or only airlines produces no
 * output, i.e. the result is an inner join on the city.
 */
public class JoinReducer extends Reducer<Text, Text, Text, Text> {

    // Reused output holders, so no Text is allocated per emitted pair.
    private Text countryOutput;
    private Text airlineOutput;

    // Per-key buffers; sets also remove duplicate countries/airlines for a city.
    private Set<String> countrySet;
    private Set<String> airlineSet;
    // Scratch holder for the value currently being inspected in reduce().
    String airlineOrCountry;

    @Override
    protected void setup(Context context)
                            throws IOException, InterruptedException {
        super.setup(context);

        countryOutput = new Text();
        airlineOutput = new Text();

        countrySet = new HashSet<>();
        airlineSet = new HashSet<>();
    }

    /** Resets the per-key state so values of a previous city do not leak into the next one. */
    public void clear() {
        countrySet.clear();
        airlineSet.clear();
        airlineOrCountry = null;
    }

    /**
     * Joins the countries and airlines collected for one city.
     *
     * @param key     the city
     * @param values  tagged countries and airlines for that city, in no particular order
     * @param context task context used to emit (country, airline) pairs
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
                                           throws IOException, InterruptedException {
        // Clear the sets before processing each key
        clear();

        for (Text val : values) {
            airlineOrCountry = val.toString();
            if (airlineOrCountry.startsWith(CityToAirlinesMapper.AIRLINE_DELIMTER)) {
                addPayload(airlineSet, airlineOrCountry, CityToAirlinesMapper.AIRLINE_DELIMTER);
            } else if (airlineOrCountry.startsWith(CountryToCityMapper.COUNTRY_DELIMTER)) {
                addPayload(countrySet, airlineOrCountry, CountryToCityMapper.COUNTRY_DELIMTER);
            } else {
                // Neither a country nor an airline: the input for this city is not
                // trustworthy, so do not write any output for it.
                return;
            }
        }

        // Inner join: the nested loops emit the cross product of countries and airlines,
        // which is empty whenever one side is missing. A left/right/full outer join would
        // additionally emit the non-empty side when the other set is empty.
        for (String country : countrySet) {
            countryOutput.set(country);
            for (String airline : airlineSet) {
                airlineOutput.set(airline);
                context.write(countryOutput, airlineOutput);
            }
        }
    }

    /**
     * Strips the leading tag from a value and stores the remainder in the target set.
     *
     * <p>substring() is used rather than split(prefix)[1]: split treats the prefix as a regex
     * and cuts at every occurrence, so a name that itself contains the prefix (for example
     * "ROYAL_AIR" contains "AL_") would be truncated, and an empty payload would throw
     * ArrayIndexOutOfBoundsException. Empty payloads are ignored.
     */
    private static void addPayload(Set<String> target, String taggedValue, String prefix) {
        String payload = taggedValue.substring(prefix.length());
        if (!payload.isEmpty()) {
            target.add(payload);
        }
    }
}

[addToAppearHere]

Key Take Aways:
1. A join means gathering, for a given key, all the values associated with that key.
2. In a reduce-side join, data is shuffled from all the mappers to the reducers.
3. For a given key all the values are available on a given reducer
4. All values for a given key will be collected at a given reducer, But a reducer can collect more than one key
5. Because all values for a given key arrive at one reducer, whether the result is an inner, left outer, right outer, or full outer join depends only on how the reducer's output logic is written.

 

Driver:

 

package com.big.data.mapreduce.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * Driver for the reduce-side join: given country -> city and city -> airline inputs, the job
 * produces country -> airline pairs.
 *
 * <p>Each input has its own schema and therefore its own mapper; both mappers emit the city
 * as key so that {@link JoinReducer} sees the countries and airlines of a city together.
 * The join type (inner, left/right/full outer) is decided purely by the reducer's output
 * logic.
 */
public class CountryToAirlineDriver extends Configured implements Tool {
    // Extending Configured and implementing Tool lets ToolRunner parse the arguments.
    // Arguments need to be passed as -Dkey=value.

    public static final String INPUT_PATH_LEFT = "input.path.left";
    public static final String INPUT_PATH_RIGHT = "input.path.right";
    public static final String OUTPUT_PATH = "output.path";
    public static final String DELIMTER = ",";

    public static void main(String[] args) throws Exception {
        if (ToolRunner.run(new CountryToAirlineDriver(), args) != 0) {
            throw new IOException("Job has failed");
        }
    }

    /**
     * Configures and runs the join job.
     *
     * @param args remaining command-line arguments after ToolRunner consumed the -D options
     * @return 0 when the job succeeded, 1 otherwise
     * @throws IllegalArgumentException when one of the required path properties is not set
     */
    @Override
    public int run(String[] args) throws Exception {

        // The -Dkey=value arguments have already been placed in the configuration by ToolRunner
        Configuration conf = getConf();
        Path inputPathLeft = new Path(requireProperty(conf, INPUT_PATH_LEFT));
        Path inputPathRight = new Path(requireProperty(conf, INPUT_PATH_RIGHT));
        Path outputPath = new Path(requireProperty(conf, OUTPUT_PATH));

        // Job.getInstance replaces the deprecated Job constructor
        Job job = Job.getInstance(conf, "CountryToAirlineDriver");
        job.setJarByClass(CountryToAirlineDriver.class);

        // The left path holds country,city lines -> CountryToCityMapper.
        // The right path holds city,airline lines -> CityToAirlinesMapper.
        // The schemas differ, hence a different mapper per input.
        MultipleInputs.addInputPath(job, inputPathLeft,
                                         TextInputFormat.class, CountryToCityMapper.class);
        MultipleInputs.addInputPath(job, inputPathRight,
                                         TextInputFormat.class, CityToAirlinesMapper.class);

        // Directory the job output is written to
        FileOutputFormat.setOutputPath(job, outputPath);

        // Output format and the key/value types of the map and reduce phases
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // The join itself happens in the reducer
        job.setReducerClass(JoinReducer.class);

        // Blocks until the job has completed, reporting progress meanwhile
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Returns the value of a required configuration property.
     *
     * <p>Without this check a missing property surfaces as an IllegalArgumentException from
     * the Path constructor that does not say which -D option was forgotten.
     */
    private static String requireProperty(Configuration conf, String key) {
        String value = conf.get(key);
        if (value == null || value.isEmpty()) {
            throw new IllegalArgumentException(
                    "Missing required property '" + key + "'; pass it as -D" + key + "=<path>");
        }
        return value;
    }
}
[addToAppearHere]

Integration Test (Github)

 

package com.big.data.mapreduce.join;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;


/**
 * Integration test that runs the join job against small files on the local filesystem and
 * checks the resulting country -> airlines mapping.
 */
public class CountryToAirlineDriverTest {

    private static FileSystem fs;
    private static String baseDir;
    private static String outputDir;
    private static String leftdir;
    private static String rightdir;
    private static final String NEW_LINE_DELIMETER = "\n";
    // Output of the job, parsed into country -> set of airlines
    private static Map<String, Set<String>> countryToAirline;

    @BeforeClass
    public static void startup() throws Exception {

        Configuration conf = new Configuration();
        //set the fs to file:/// which means the local fileSystem
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.job.tracker", "local");
        fs = FileSystem.getLocal(conf);
        // A random sub directory keeps concurrent or repeated runs from colliding
        baseDir = "/tmp/mapreduce/join/" + UUID.randomUUID().toString() + "/";

        leftdir = baseDir + "left";
        rightdir = baseDir + "right";

        outputDir = baseDir + "/output/";

        // Left input: country,city
        File tempFileleft = new File(leftdir + "/input.txt");
        FileUtils.writeStringToFile(tempFileleft, "Germany,Berlin", "UTF-8");
        FileUtils.writeStringToFile(tempFileleft, NEW_LINE_DELIMETER, "UTF-8", true);
        FileUtils.writeStringToFile(tempFileleft, "India,Delhi", "UTF-8", true);

        // Right input: city,airline
        File tempFileRight = new File(rightdir + "/input.txt");
        FileUtils.writeStringToFile(tempFileRight, "Berlin,Tegel", "UTF-8");
        FileUtils.writeStringToFile(tempFileRight, NEW_LINE_DELIMETER, "UTF-8", true);
        FileUtils.writeStringToFile(tempFileRight, "Berlin,Schonfield", "UTF-8", true);
        FileUtils.writeStringToFile(tempFileRight, NEW_LINE_DELIMETER, "UTF-8", true);
        FileUtils.writeStringToFile(tempFileRight, "Delhi,IGI", "UTF-8", true);

        countryToAirline = new HashMap<>();
    }

    @AfterClass
    public static void cleanup() throws Exception {
        //Delete the local filesystem folder after the Job is done
        fs.delete(new Path(baseDir), true);
    }

    /**
     * Reads a job output file and merges its lines into {@link #countryToAirline}.
     * Each line is expected to be "country TAB airline".
     */
    void fileToHashMap(String filePath) throws IOException {

        File outputFile = new File(filePath);
        String fileToString = FileUtils.readFileToString(outputFile, "UTF-8");

        for (String line : fileToString.split(NEW_LINE_DELIMETER)) {
            // An empty file splits into a single empty string; nothing to parse there
            if (line.isEmpty()) {
                continue;
            }
            String[] countryToAirlineArray = line.split("\t");
            countryToAirline
                    .computeIfAbsent(countryToAirlineArray[0], country -> new HashSet<>())
                    .add(countryToAirlineArray[1]);
        }
    }

    @Test
    public void countryToAirlineTest() throws Exception {

        // Any argument passed with -DKey=Value will be parsed by ToolRunner
        String[] args = new String[]{
             "-D" + CountryToAirlineDriver.INPUT_PATH_LEFT + "=" + leftdir,
             "-D" + CountryToAirlineDriver.INPUT_PATH_RIGHT+ "=" + rightdir,
             "-D" + CountryToAirlineDriver.OUTPUT_PATH + "=" + outputDir};
        // call the main function to run the job
        CountryToAirlineDriver.main(args);

        fileToHashMap(outputDir + "/part-r-00000");

        // Two countries are present in the input and both have a matching city on the right
        Assert.assertEquals(2L, countryToAirline.size());

        // Germany -> Berlin -> {Tegel, Schonfield}
        Assert.assertEquals(2L, countryToAirline.get("Germany").size());
        Assert.assertTrue(countryToAirline.get("Germany").contains("Tegel"));
        Assert.assertTrue(countryToAirline.get("Germany").contains("Schonfield"));

        // India -> Delhi -> {IGI}
        Assert.assertEquals(1L, countryToAirline.get("India").size());
        Assert.assertTrue(countryToAirline.get("India").contains("IGI"));
    }

}