Revision 45806
Added by Eri Katsari almost 8 years ago
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LinkCustomReducer.java (new file)
```java
package eu.dnetlib.data.mapreduce.hbase.lodExport.linkage;

import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.MyComparator;
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.RedisUtils;
import org.aksw.limes.core.io.cache.MemoryCache;
import org.aksw.limes.core.io.config.KBInfo; // assumed import for the reconstructed fields below
import org.aksw.limes.core.io.config.reader.xml.XMLConfigurationReader;
import org.aksw.limes.core.io.preprocessing.Preprocessor;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;
import redis.clients.jedis.Jedis; // assumed import for the reconstructed 'connection' field below

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class LinkCustomReducer extends Reducer<Text, Text, Text, Text> {

    private Logger log = Logger.getLogger(LinkCustomReducer.class);

    private static final String OPTIMAL_BLOCKS = "optimalBlockSize";
    private long optimalBlockSize;
    private RedisUtils redisUtils;

    // NOTE: the delimiters and the LIMES/Redis objects below are referenced by
    // this class but not declared anywhere in the committed file; these
    // declarations are reconstructed assumptions that mirror MyComparator and
    // LimesReducer from the same commit.
    private static final String SEPERATOR = ",";
    private static final String LINE_SEPERATOR = "\t.\t";
    private static final String FIELD_DELIM = "\t";
    private KBInfo sourceKb;
    private KBInfo targetKb;
    private MemoryCache sourceCache = new MemoryCache();
    private MemoryCache targetCache = new MemoryCache();
    private Jedis connection;

    public static enum LINK_REDUCER_COUNTERS {
        TARGET_TRIPLES,
        SOURCE_TRIPLES,
        WRITTEN_OUT_ENTITIES
    }

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        try {
            redisUtils = new RedisUtils(context);
            optimalBlockSize = Integer.valueOf(redisUtils.getValue(OPTIMAL_BLOCKS));
            log.info("OPTIMAL BLOCK SIZE " + optimalBlockSize);
        } catch (Exception e) {
            log.error("Error setting up reducer stats: " + e.toString());
            throw new RuntimeException("Error setting up reducer stats", e);
        }
    }

    @Override
    protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException {
        // each item in the list is a block with the given key;
        // fillLimesCache and compareRecords are not wired in yet in this revision
        Map<String, String> sourceRecords = new HashMap<>();
        Map<String, String> targetRecords = new HashMap<>();

        for (Text block : values) {
            try {
                context.getCounter(LimesReducer.LIMES_COUNTERS.LINKED_BLOCKS).increment(1);
                sourceRecords.clear();
                targetRecords.clear();
            } catch (Exception e) {
                log.error(e.toString(), e);
                throw new IOException(e);
            }
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
    }

    private void fillLimesCache(Text block, Context context) throws Exception {
        // a block is a list of record ids; fetch each record from Redis
        String[] split = block.toString().split(SEPERATOR);
        for (String recordId : split) {
            String record = redisUtils.getValue(recordId);
            if (record == null) {
                log.error("Record " + recordId + " not found!");
                throw new Exception("Record " + recordId + " not found!");
            }

            // strip the leading record id, then split the rest into triples
            record = record.substring(record.indexOf(FIELD_DELIM) + 1).trim();
            String[] triples = record.split(LINE_SEPERATOR);
            for (String triple : triples) {
                String[] fields = triple.split(FIELD_DELIM);
                String subject = fields[0];
                // props in sourceKb do not contain <>, so we need to strip them
                String property = fields[1].replaceAll("[<>]", "");
                String value = fields[2];
                if (sourceKb.getProperties().contains(property) || targetKb.getProperties().contains(property)) {
                    if (recordId.contains("source_")) {
                        for (String propertyDub : sourceKb.getFunctions().get(property).keySet()) {
                            String processedValue = Preprocessor.process(value, sourceKb.getFunctions().get(property).get(propertyDub));
                            sourceCache.addTriple(subject, propertyDub, processedValue);
                        }
                        context.getCounter(LimesReducer.LIMES_COUNTERS.SOURCE_TRIPLES).increment(1);
                    } else {
                        for (String propertyDub : targetKb.getFunctions().get(property).keySet()) {
                            String processedValue = Preprocessor.process(value, targetKb.getFunctions().get(property).get(propertyDub));
                            targetCache.addTriple(subject, propertyDub, processedValue);
                        }
                        context.getCounter(LimesReducer.LIMES_COUNTERS.TARGET_TRIPLES).increment(1);
                    }
                }
            }
        }
    }

    private void compareRecords(Map<String, String> sourceRecords, Map<String, String> targetRecords, Context context) throws IOException, InterruptedException {
        for (String sourceId : sourceRecords.keySet()) {
            // TODO modify Redis to use hmset/hmget in order to save records as a map
            // and retrieve fields individually as needed (only those needed for comparison)
            String sourceRecord = connection.get(sourceId);
            for (String targetId : targetRecords.keySet()) {
                String targetRecord = connection.get(targetId);
                double similarity = MyComparator.findMatchingPair(sourceRecord, targetRecord);
                if (similarity > 0) {
                    context.write(new Text(sourceId), new Text(targetId + "," + similarity));
                    // WRITTEN_OUT_ENTITIES lives in this class's own counter enum
                    context.getCounter(LINK_REDUCER_COUNTERS.WRITTEN_OUT_ENTITIES).increment(1);
                }
            }
        }
    }
}
```
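The TODO in compareRecords points at a possible optimization: storing each record as a Redis hash so that only the fields needed for comparison travel over the wire, instead of GET-ting and re-parsing the whole serialized record. A minimal sketch of that idea with the Jedis client (the host/port, key, and field names here are illustrative assumptions, not part of this revision):

```java
import redis.clients.jedis.Jedis;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch only: store a record as a Redis hash (HMSET) and fetch just the
// fields used for comparison (HMGET). Key and field names are hypothetical.
public class RecordHashSketch {

    public static void main(String[] args) {
        try (Jedis jedis = new Jedis("localhost", 6379)) { // assumed host/port
            // one hash per record: property -> value
            Map<String, String> record = new HashMap<>();
            record.put("<http://www.eurocris.org/ontologies/cerif/1.3#name>", "Some publication title");
            record.put("<http://lod.openaire.eu/vocab/year>", "2013");
            jedis.hmset("source_1", record);

            // retrieve only the two mapped fields that findMatchingPair compares
            List<String> values = jedis.hmget("source_1",
                    "<http://www.eurocris.org/ontologies/cerif/1.3#name>",
                    "<http://lod.openaire.eu/vocab/year>");
            System.out.println(values);
        }
    }
}
```

This would also make the string re-parsing in MyComparator.getRecordsFiledMap unnecessary on the comparison path, since the field map comes back from Redis directly.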
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/test/java/Test.java
```diff
@@ -41,18 +41,7 @@
         System.out.println(Arrays.toString(myLine.split(" ")));
     }
 
-    @org.junit.Test
-    public void testType()
 
-    {
-        String id = "<http://dblp.l3s.de/d2r/resource/publications/journals/ijdsn/DanZYY13>";
-
-        String s = id.substring(0, id.indexOf('D'));
-
-        System.out.println(s);
-    }
-
-
     @org.junit.Test
     public void writeToRedis() throws Exception {
 
```
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/MyComparator.java
```diff
@@ -6,39 +6,29 @@
 public class MyComparator {
     private static double FIELDS_SIMILARITY_THRESHOLD = 0.7;
     private static double RECORD_SIMILARITY_THRESHOLD = 0.8;
-    private static String SEPARATOR = ",";
+    private static final String LINE_SEPERATOR = "\t.\t";
+    private static final String FIELD_DELIM = "\t";
     private static Map<String, String> sourceRecordMappings = new HashMap<>();
 
     static {
         //TODO remove later!!! make it configurable
-        sourceRecordMappings.put("http://www.eurocris.org/ontologies/cerif/1.3#name",
-                "http://www.w3.org/2000/01/rdf-schema#label");
-        sourceRecordMappings.put("http://lod.openaire.eu/vocab/year", "http://purl.org/dc/terms/issued");
+        sourceRecordMappings.put("<http://www.eurocris.org/ontologies/cerif/1.3#name>",
+                "<http://www.w3.org/2000/01/rdf-schema#label>");
+        sourceRecordMappings.put("<http://lod.openaire.eu/vocab/year>", "<http://purl.org/dc/terms/issued>");
     }
 
     public static double findMatchingPair(String source, String target) {
+        Map<String, String> sourceRecordMap = getRecordsFiledMap(source);
+        Map<String, String> targetRecordMap = getRecordsFiledMap(target);
 
-        String[] sourceFields = source.split(SEPARATOR);
-        Map<String, String> sourceFieldsMap = new HashMap<>();
-        for (int j = 0; j < sourceFields.length; j++) {
-            String[] split = sourceFields[j].split("\t");
-            sourceFieldsMap.put(split[0], split[1]);
-        }
-        //get target fields
-        String[] targetFields = target.split(",");
-        Map<String, String> targetFieldsMap = new HashMap<>();
-        for (int j = 0; j < targetFields.length; j++) {
-            String[] split = targetFields[j].split("\t");
-            targetFieldsMap.put(split[0], split[1]);
-        }
         //similarity counters
         int matchedFields = 0;
-        double totalFields = (double) targetFields.length;
+        double totalFields = (double) sourceRecordMappings.size();
         double recordSimilarity = 0.0;
 
-        for (Map.Entry<String, String> sourceField : sourceFieldsMap.entrySet()) {
+        for (Map.Entry<String, String> sourceField : sourceRecordMap.entrySet()) {
             String correspondingTargetField = sourceRecordMappings.get(sourceField.getKey());
-            String targetFieldValue = targetFieldsMap.get(correspondingTargetField);
+            String targetFieldValue = targetRecordMap.get(correspondingTargetField);
             double fieldsSimilarity = compare(sourceField.getValue(), targetFieldValue);
             System.out.println(sourceField + "\n" + targetFieldValue + "\n : field similarity: " + fieldsSimilarity + "\n-----------------------------------------");
             recordSimilarity += fieldsSimilarity;
@@ -66,8 +56,17 @@
             }
         }
         // System.out.println("Similar chars " + similarChars);
-        return (sourceValue.length() >= targetValue.length() ? similarChars / (double) sourceValue.length() : similarChars / (double)targetValue.length());
+        return (sourceValue.length() >= targetValue.length() ? similarChars / (double) sourceValue.length() : similarChars / (double) targetValue.length());
     }
 
+    private static Map<String, String> getRecordsFiledMap(String source) {
+        String sourceRecord = source.substring(source.indexOf(FIELD_DELIM) + 1).trim();
+        String[] sourceTriples = sourceRecord.split(LINE_SEPERATOR);
+        Map<String, String> sourceFieldsMap = new HashMap<>();
+        for (String sourceTriple : sourceTriples) {
+            String[] split = sourceTriple.split(FIELD_DELIM);
+            sourceFieldsMap.put(split[0], split[1]);
+        }
+        return sourceFieldsMap;
+    }
 }
```
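For reference, a small usage sketch of the reworked findMatchingPair. The record layout is inferred from getRecordsFiledMap (a record id, a tab, then property-TAB-value pairs joined by the "\t.\t" line separator); the record ids and field values below are made up:

```java
import eu.dnetlib.data.mapreduce.hbase.lodExport.utils.MyComparator;

// Sketch only: record strings follow the layout that getRecordsFiledMap
// expects, and the property keys match the hard-coded sourceRecordMappings.
public class MyComparatorExample {

    public static void main(String[] args) {
        String source = "source_1\t"
                + "<http://www.eurocris.org/ontologies/cerif/1.3#name>\tEnergy efficient routing"
                + "\t.\t"
                + "<http://lod.openaire.eu/vocab/year>\t2013";

        String target = "target_1\t"
                + "<http://www.w3.org/2000/01/rdf-schema#label>\tEnergy efficient routing"
                + "\t.\t"
                + "<http://purl.org/dc/terms/issued>\t2013";

        // each source property is translated through sourceRecordMappings to its
        // target counterpart and the two field values are compared character-wise
        double similarity = MyComparator.findMatchingPair(source, target);
        System.out.println("record similarity: " + similarity);
    }
}
```

Note that totalFields is now the size of sourceRecordMappings rather than the number of fields in the target record, so unmapped target properties no longer dilute the score.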
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/linkage/LimesReducer.java
```diff
@@ -96,7 +96,6 @@
             throw new Exception("Record " + recordId + " not found! ");
 
         }
-
         record = record.substring(record.indexOf(FIELD_DELIM) + 1).trim();
         String[] Triples = record.split(LINE_SEPERATOR);
         for (String triple : Triples) {
```