Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

loading data file with 3 spaces as delimiter using Sparks csv reader in java

I have a data file of numeric values that I am trying to read in, the data looks like:

   1   6   4  12   5   5   3   4   1  67   3   2   1   2   1   0   0   1   0   0   1   0   0   1   1 
   2  48   2  60   1   3   2   2   1  22   3   1   1   1   1   0   0   1   0   0   1   0   0   1   2 

It is delimited by 3 spaces. I want to have this in a Spark DataFrame.

I'm struggling to parse this, it seems to read each line as one big string.

I've tired the following;

Dataset<Row> df = spark.read().format("com.databricks.spark.csv")
            .option("header", "false")
            .option("delimter", "\t")
            .load(csvFile);
    df.show(5);

also:

.option("delimter", "   ") // leads to java error that Delimter cant take more than one character

also tired .option("sep", "\t") instead of "delimter":

Here is my full code:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CreditRiskML {

static SparkSession spark = SparkSession.builder()
        .appName("Credit Risk ML")
        .master("local[*]")
        .config("spark.sql.warehouse.dir", "E:/Exp/")
        .getOrCreate();

public static double parseDouble(String str){
    return Double.parseDouble(str);
}



public static void main(String[] args){

    String csvFile = "input\\credit.data";
    Dataset<Row> df = spark.read().format("com.databricks.spark.csv")
            .option("header", "false")
            .option("delimter", "\t")
            .option("sep", "\t")
            .load(csvFile);
    df.show(5);

    //create RDD of type Credit

    JavaRDD<Credit> creditRdd = df.toJavaRDD().map(new Function<Row, Credit>() {
        @Override
        public Credit call(Row r) throws Exception {
            return new Credit(parseDouble(r.getString(0)), parseDouble(r.getString(1)) - 1,
                    parseDouble(r.getString(2)), parseDouble(r.getString(3)), parseDouble(r.getString(4)),
                    parseDouble(r.getString(5)), parseDouble(r.getString(6)) - 1, parseDouble(r.getString(7)) - 1,
                    parseDouble(r.getString(8)), parseDouble(r.getString(9)) - 1, parseDouble(r.getString(10)) - 1,
                    parseDouble(r.getString(11)) - 1, parseDouble(r.getString(12)) - 1,
                    parseDouble(r.getString(13)), parseDouble(r.getString(14)) - 1,
                    parseDouble(r.getString(15)) - 1, parseDouble(r.getString(16)) - 1,
                    parseDouble(r.getString(17)) - 1, parseDouble(r.getString(18)) - 1,
                    parseDouble(r.getString(19)) - 1, parseDouble(r.getString(20)) - 1);
        }
    });

    //Create a dataset of type Row from the RDD of type Credit
    Dataset<Row> creditData = spark.sqlContext().createDataFrame(creditRdd, Credit.class);

    creditData.show(5);

 }
}

error message:

java.lang.NumberFormatException: For input string: "1   6   4  12   5   5   3   4   1  67   3   2   1   2   1   0   0   1   0   0   1   0   0   1   1"

What is the best way to get around this ? Any help is much appreciated.

P.s here is the credit class:

public class Credit {
    private double creditability;
    private double balance;
    private double duration;
    private double history;
    private double purpose;
    private double amount;
    private double savings;
    private double employment;
    private double instPercent;
    private double sexMarried;
    private double guarantors;
    private double residenceDuration;
    private double assets;
    private double age;
    private double concCredit;
    private double apartment;
    private double credits;
    private double occupation;
    private double dependents;
    private double hasPhone;
    private double foreign;

    public Credit(double creditability, double balance, double duration, double history, double purpose, double amount,
                  double savings, double employment, double instPercent, double sexMarried, double guarantors,
                  double residenceDuration, double assets, double age, double concCredit, double apartment, double credits,
                  double occupation, double dependents, double hasPhone, double foreign) {
        super();
        this.creditability = creditability;
        this.balance = balance;
        this.duration = duration;
        this.history = history;
        this.purpose = purpose;
        this.amount = amount;
        this.savings = savings;
        this.employment = employment;
        this.instPercent = instPercent;
        this.sexMarried = sexMarried;
        this.guarantors = guarantors;
        this.residenceDuration = residenceDuration;
        this.assets = assets;
        this.age = age;
        this.concCredit = concCredit;
        this.apartment = apartment;
        this.credits = credits;
        this.occupation = occupation;
        this.dependents = dependents;
        this.hasPhone = hasPhone;
        this.foreign = foreign;
    }

    public double getCreditability() {
        return creditability;
    }

    public void setCreditability(double creditability) {
        this.creditability = creditability;
    }

    public double getBalance() {
        return balance;
    }

    public void setBalance(double balance) {
        this.balance = balance;
    }

    public double getDuration() {
        return duration;
    }

    public void setDuration(double duration) {
        this.duration = duration;
    }

    public double getHistory() {
        return history;
    }

    public void setHistory(double history) {
        this.history = history;
    }

    public double getPurpose() {
        return purpose;
    }

    public void setPurpose(double purpose) {
        this.purpose = purpose;
    }

    public double getAmount() {
        return amount;
    }

    public void setAmount(double amount) {
        this.amount = amount;
    }

    public double getSavings() {
        return savings;
    }

    public void setSavings(double savings) {
        this.savings = savings;
    }

    public double getEmployment() {
        return employment;
    }

    public void setEmployment(double employment) {
        this.employment = employment;
    }

    public double getInstPercent() {
        return instPercent;
    }

    public void setInstPercent(double instPercent) {
        this.instPercent = instPercent;
    }

    public double getSexMarried() {
        return sexMarried;
    }

    public void setSexMarried(double sexMarried) {
        this.sexMarried = sexMarried;
    }

    public double getGuarantors() {
        return guarantors;
    }

    public void setGuarantors(double guarantors) {
        this.guarantors = guarantors;
    }

    public double getResidenceDuration() {
        return residenceDuration;
    }

    public void setResidenceDuration(double residenceDuration) {
        this.residenceDuration = residenceDuration;
    }

    public double getAssets() {
        return assets;
    }

    public void setAssets(double assets) {
        this.assets = assets;
    }

    public double getAge() {
        return age;
    }

    public void setAge(double age) {
        this.age = age;
    }

    public double getConcCredit() {
        return concCredit;
    }

    public void setConcCredit(double concCredit) {
        this.concCredit = concCredit;
    }

    public double getApartment() {
        return apartment;
    }

    public void setApartment(double apartment) {
        this.apartment = apartment;
    }

    public double getCredits() {
        return credits;
    }

    public void setCredits(double credits) {
        this.credits = credits;
    }

    public double getOccupation() {
        return occupation;
    }

    public void setOccupation(double occupation) {
        this.occupation = occupation;
    }

    public double getDependents() {
        return dependents;
    }

    public void setDependents(double dependents) {
        this.dependents = dependents;
    }

    public double getHasPhone() {
        return hasPhone;
    }

    public void setHasPhone(double hasPhone) {
        this.hasPhone = hasPhone;
    }

    public double getForeign() {
        return foreign;
    }

    public void setForeign(double foreign) {
        this.foreign = foreign;
    }
}
like image 936
ukbaz Avatar asked Dec 03 '25 04:12

ukbaz


1 Answers

One method of getting around this is to use java.util.Scanner. Because you're using whitespaces, you do not need to specify a delimiter.

String s = "1   0   2   0";
Scanner scanner = new Scanner(s);

while(scanner.hasNext()){
  System.out.println(scanner.next());
}

The output will be:

1
0
2
0

This will work regardless of the amount of whitespaces in the given String.

like image 148
DCON Avatar answered Dec 05 '25 18:12

DCON



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!