lunedì 23 maggio 2011

From Freedb.org to OrientDB - #4

#1 - #2 - #3

Download a big file about 170 Mb.
Uncompress  ... about 614 Mb and 2630481 file...

to speed up the importation are processed only 1000 files per folder.

sample porting schema from FreeDB to OrientDB:



import code




import com.orientechnologies.orient.core.db.document.ODatabaseDocumentTx;
import com.orientechnologies.orient.core.metadata.schema.OProperty.INDEX_TYPE;
import com.orientechnologies.orient.core.metadata.schema.OType;
import com.orientechnologies.orient.core.record.impl.ODocument;
import com.orientechnologies.orient.core.sql.query.OSQLSynchQuery;
import com.orientechnologies.orient.server.OServer;
import com.orientechnologies.orient.server.OServerMain;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.io.FileUtils;

private String base = "/Users/marco/orientdb/";
private String root_dir = base + "/file_freedb/freedb-complete-20090101/";
private HashMap cache_year = new HashMap();
private HashMap cache_genre = new HashMap();

  private void import_data() {

        try {

            OServer server = OServerMain.create();
            server.startup(new File(base + "/file/conf.xml"));

            ODatabaseDocumentTx db = new ODatabaseDocumentTx("local:" + base + "/freedb");
            if (!db.exists()) {
                db.create();
                System.out.println("create new DB");
            } else {
                db.delete();
                db.create();
                System.out.println("delete and create new DB");
            }


            FileFilter directoryFilter = new FileFilter() {

                public boolean accept(File file) {
                    return file.isDirectory();
                }
            };

            //default index on odocument
            db.begin();


            ODocument oArtist = new ODocument(db, "artist");
            oArtist.field("name", "Various", OType.STRING);
            oArtist.save();

            db.getMetadata().getSchema().getClass("artist").createProperty("name", OType.STRING).createIndex(INDEX_TYPE.FULLTEXT);
            db.getMetadata().getSchema().save();


            ODocument oTrack = new ODocument(db, "track");
            oTrack.field("title", "Various", OType.STRING);
            oTrack.save();

            db.getMetadata().getSchema().getClass("track").createProperty("title", OType.STRING).createIndex(INDEX_TYPE.FULLTEXT);
            db.getMetadata().getSchema().save();


            ODocument oGendr = new ODocument(db, "genre");
            oGendr.field("name", "Various", OType.STRING);
            oGendr.save();

            db.getMetadata().getSchema().getClass("genre").createProperty("name", OType.STRING).createIndex(INDEX_TYPE.UNIQUE);
            db.getMetadata().getSchema().save();


            ODocument oYear = new ODocument(db, "year");
            oYear.field("data", "19000101", OType.DATE);
            oYear.save();

            db.getMetadata().getSchema().getClass("year").createProperty("data", OType.DATE).createIndex(INDEX_TYPE.UNIQUE);
            db.getMetadata().getSchema().save();


            db.commit();


            File[] dirs = new File(root_dir).listFiles(directoryFilter);
            int i = 0;
            for (File dir : dirs) {

                if (!dir.isDirectory()) {
                    continue;
                }

                System.out.println("" + dir);

                int max_file_for_debug = 0;
                Collection files = FileUtils.listFiles(dir, null, true);
                for (File file : files) {
                    if (file.getName().startsWith(".")) {
                        continue;
                    }


                    try {
                        List lines = FileUtils.readLines(file);

                        db.begin();
                        ODocument oDisk = new ODocument(db, "disk");

                        ArrayList tracks = new ArrayList();

                        String titles = "";
                        String extd = "";
                        for (String line : lines) {

                            if (line.startsWith("# Disc length:")) {
                                String length = line.replaceAll("# Disc length:", "").replaceAll("seconds", "").replaceAll("secs", "").trim();

                                oDisk.field("Disc Length", length, OType.INTEGER);
                            }

                            if (line.startsWith("# Revision:")) {
                                String revision = line.replaceAll("# Revision:", "").trim();


                                oDisk.field("revision", revision, OType.INTEGER);
                            }


                            if (line.startsWith("#")) {
                                continue;
                            }

                            String ele[] = line.split("=");

                            if (ele == null || ele.length == 1) {
                                continue;
                            }

                            String key = ele[0];
                            String value = ele[1];


                            if (key.equals("DISKID")) {
                                oDisk.field(key.toLowerCase(), value, OType.STRING);
                            }

                            if (key.equals("DYEAR")) {
                                oDisk.field("year", check_and_create_year(value + "0101", db), OType.LINK);
                            }

                            if (key.equals("DGENRE")) {

                                oDisk.field("genre", check_and_create_genre(value, db), OType.LINK);
                            }

                            //concatenate multiple title lines
                            if (key.equals("DTITLE")) {
                                titles += value;
                            }

                            if (key.equals("EXTD")) {
                                extd += value;
                            }



                            //tracks list
                            if (key.startsWith("TTITLE")) {
                                oTrack = new ODocument(db, "track");
                                oTrack.field("n", key.replaceAll("TTITLE", ""), OType.INTEGER);

                                String tartist = "";


                                oTrack.field("title", getTitle(value));

                                tartist = getAuthor(value);

                                if (!tartist.equals("")) {

                                    oTrack.field("artist", check_and_create_artist(tartist, db), OType.LINK);
                                } else {
                                    oTrack.field("artist");
                                }


                                oTrack.save();
                                tracks.add(oTrack);

                            }

                        }

                        //add track_list
                        if (!tracks.isEmpty()) {
                            oDisk.field("tracks", tracks, OType.EMBEDDEDLIST);
                        }

                        //title and artist disk
                        if (!titles.equals("")) {

                            oDisk.field("title", getTitle(titles));
                            oDisk.field("artist", check_and_create_artist(getAuthor(titles), db), OType.LINK);

                        }

                        //title and artist disk
                        if (!extd.equals("")) {
                            oDisk.field("extd", extd);
                        }

                        oDisk.save();
                        db.commit();

                    } catch (IOException ex) {
                        System.out.println("ex (1):" + ex.getMessage() + ex.getStackTrace().toString());

                        for (StackTraceElement s : ex.getStackTrace()) {
                            System.out.println("" + s);
                        }

                        db.rollback();
                        continue;

                    }


                    i++;
                    if ((i >= 1000) && (i % 1000) == 1) {
                        System.out.println("\t" + file);
                        for (String s : db.getClusterNames()) {
                            System.out.println("cluster: " + s + " - " + db.countClusterElements(s));

                        }

                        //for debug max 1000 file for folder
                        break;
                    }

                }

            }


            db.close();

            //server
            server.shutdown();
        } catch (Exception ex) {
            System.out.println("ex (2):" + ex.getMessage() + ex.getStackTrace().toString());


            for (StackTraceElement s : ex.getStackTrace()) {
                System.out.println("" + s);
            }

        }

    }

    private ODocument check_and_create_artist(String name, ODatabaseDocumentTx db) {


        if (name.equals("")) {
            return null;
        }


        OSQLSynchQuery query = new OSQLSynchQuery("select from artist where name = ?");
        List result = db.command(query).execute(name);


        if (!result.isEmpty()) {
            return (ODocument) result.get(0);

        } else {

            ODocument oArtist = new ODocument(db, "artist");
            oArtist.field("name", /*a*/ name, OType.STRING);
            oArtist.save();

            return oArtist;
        }


    }

    private ODocument check_and_create_year(String year, ODatabaseDocumentTx db) {

        if (cache_year.containsKey(year)) {
            return cache_year.get(year);
        }


        OSQLSynchQuery query = new OSQLSynchQuery("select from year where data = ?");
        List result = db.command(query).execute(year);


        if (!result.isEmpty()) {
            cache_year.put(year, (ODocument) result.get(0));
            return (ODocument) result.get(0);

        } else {

            ODocument oYear = new ODocument(db, "year");
            oYear.field("data", year, OType.DATE);
            oYear.save();
            cache_year.put(year, oYear);

            return oYear;
        }


    }

    private ODocument check_and_create_genre(String genre, ODatabaseDocumentTx db) {

        if (cache_genre.containsKey(genre)) {
            return cache_genre.get(genre);
        }

        OSQLSynchQuery query = new OSQLSynchQuery("select from genre where name = ?");
        List result = db.command(query).execute(genre);

        if (!result.isEmpty()) {
            cache_genre.put(genre, (ODocument) result.get(0));
            return (ODocument) result.get(0);

        } else {

            ODocument oGenre = new ODocument(db, "genre");
            oGenre.field("name", genre, OType.STRING);
            oGenre.save();
            cache_genre.put(genre, oGenre);
            return oGenre;
        }


    }

    private String getTitle(String value) {


        if (value.indexOf("/") == -1) {
            return escape(value);
        }

        try {
            return escape(value.split("/")[0]);
        } catch (Exception e) {
            return "";
        }

    }

    private String getAuthor(String value) {

        if (value.indexOf("/") == -1) {
            return "";
        }

        if (value.split("/").length == 0) {
            return "";
        }

        try {
            return escape(value.split("/")[1]);
        } catch (Exception e) {
            return "";
        }


    }

    private String escape(String s) {
        if (s == null) {
            return s;
        }
        return s.trim().replaceAll("\\[", "").replaceAll("\\]", "").replaceAll("'", "\\\\'");
    }



Lib:

  • commons-io-2.0.1.jar
  • orient-commons-1.0rc2-SNAPSHOT.jar
  • orientdb-client-1.0rc2-SNAPSHOT.jar
  • orientdb-core-1.0rc2-SNAPSHOT.jar
  • orientdb-enterprise-1.0rc2-SNAPSHOT.jar
  • orientdb-server-1.0rc2-SNAPSHOT.jar
  • orientdb-tools-1.0rc2-SNAPSHOT.jar
  • persistence-api-1.0.jar

after several hours...


cluster: internal - 3
cluster: index - 882
cluster: default - 0
cluster: orole - 3
cluster: ouser - 3
cluster: artist - 43767
cluster: track - 155602
cluster: genre - 995
cluster: year - 114
cluster: disk - 11001



about 4GB of db...

Test Query:


first time:
query:select from artist name like 'Pink%' tot time:25692 ms
next:
query:select from artist name like 'Pink%' tot time:1646 ms

first time:
query:select from disk where artist.name like 'Pink%' tot time: 13388 ms
next:
query:select from disk where artist.name like 'Pink%' tot time: 4714 ms

first time:
query:select from disk where tracks contains ( artist.name like 'Pink%' ) tot time: 1628 ms
next:
query:select from disk where tracks contains ( artist.name like 'Pink%' ) tot time: 1481 ms

first/next time:
query:select from disk where year.data = '19780101' tot time: 906


1 commento:

Luca Garulli ha detto...

Hi Marco,
unfortunately all the indexes against STRING types don't support the search with LIKE yet, so it's always linear.

I'm pretty sure that exact match should be much much faster:

select from artist where name = 'Pink'

should be much faster than:

select from artist name like 'Pink%'

Lvc@