# This BibTeX File has been generated by
# the Typo3 extension 'Sixpack-4-T3 by Sixten Boeck'
#
# URL:
# Date: 05/26/2017
# Non-Standard BibTex fields are included.
# state: 0 = published, 1 = accepted, 2 = submitted, 3 = to be published // if missing, published is assumed
# extern,deleted,hidden: 0 = false, 1 = true // if missing, false is assumed
# link format: Title Url // separated by a whitespace
#
# NOTE(review): authors were previously comma-separated inside one field, which
# classic BibTeX parses as a SINGLE name; rewritten below in "Last, First and ..."
# form. Non-standard Typo3 fields (project, files, link1, sorting) are kept
# verbatim -- BibTeX ignores unknown fields.

@article{AnnealingStandard,
  author  = {Vogel, Tobias and Heise, Arvid and Draisbach, Uwe and Lange, Dustin and Naumann, Felix},
  title   = {Reach for Gold: An Annealing Standard to Evaluate Duplicate Detection Results},
  journal = {JDIQ},
  year    = {2014},
  volume  = {5},
  number  = {1-2},
  project = {stratosphere},
  files   = {fileadmin/user_upload/fachgebiete/naumann/publications/2014/AnnealingStandard.pdf},
  sorting = {11264}
}

@article{journals_dbsk_LangeVDN11,
  author  = {Lange, Dustin and Vogel, Tobias and Draisbach, Uwe and Naumann, Felix},
  title   = {Projektseminar {``Similarity Search Algorithms''}},
  journal = {Datenbank-Spektrum},
  year    = {2011},
  volume  = {11},
  number  = {1},
  pages   = {51--57},
  project = {SimilaritySearch;HPI},
  link1   = {PDF fileadmin/user_upload/fachgebiete/naumann/publications/2011/SimilaritySearchDBSP.pdf},
  sorting = {58880}
}

@inproceedings{ConsensusClusteringVogelNaumannDINA2014,
  author    = {Vogel, Tobias and Naumann, Felix},
  title     = {Semi-Supervised Consensus Clustering: Reducing Human Effort},
  year      = {2014},
  abstract  = {Machine-based clustering yields fuzzy results. For example, when detecting duplicates in a dataset, different tools might end up with different clusterings. Eventually, a decision needs to be made, defining which records are in the same cluster, i.e., are duplicates. Such a definitive result is called a Consensus Clustering and can be created by evaluating the clustering attempts against each other and only resolving the disagreements by human experts. Yet, there can be different consensus clusterings, depending on the choice of disagreements presented to the human expert. In particular, they may require a different number of manual inspections. We present a set of strategies to select the smallest set of manual inspections to arrive at a consensus clustering and evaluate their efficiency on a set of real-world and synthetic datasets.},
  booktitle = {Proceedings of the International Workshop on Data Integration and Applications},
  files     = {fileadmin/user_upload/fachgebiete/naumann/publications/2014/SemiSupervisedConsensusClustering.pdf},
  sorting   = {9728}
}

@inproceedings{UnigramblockingVogelNaumann2012,
  author    = {Vogel, Tobias and Naumann, Felix},
  title     = {Automatic Blocking Key Selection for Duplicate Detection based on Unigram Combinations},
  year      = {2012},
  abstract  = {Duplicate detection is the process of identifying multiple but different representations of same real-world objects, which typically involves a large number of comparisons. Partitioning is a well-known technique to avoid many unnecessary comparisons. However, partitioning keys are usually handcrafted, which is tedious and the keys are often poorly chosen. We propose a technique to find suitable blocking keys automatically for a dataset equipped with a gold standard. We then show how to re-use those blocking keys for datasets from similar domains lacking a gold standard. Blocking keys are created based on unigrams, which we extend with length-hints for further improvement. Blocking key creation is accompanied with several comprehensive experiments on large artificial and real-world datasets.},
  booktitle = {Proceedings of the 10th International Workshop on Quality in Databases (QDB) in conjunction with VLDB},
  project   = {HPI},
  files     = {fileadmin/user_upload/fachgebiete/naumann/publications/2012/Unigram_Blocking_20Tobias_20Vogel__20Felix_20Naumann.pdf},
  sorting   = {23296}
}

@inproceedings{Instancebasedonetosomeassignmentofsimilaritymeasurestoattributescoopis2011vogelnaumann,
  author    = {Vogel, Tobias and Naumann, Felix},
  title     = {Instance-based ``one-to-some'' Assignment of Similarity Measures to Attributes},
  year      = {2011},
  abstract  = {Data quality is a key factor for economical success. It is usually defined as a set of properties of data, such as completeness, accessibility, relevance, and conciseness. The latter includes the absence of multiple representations for same real world objects. To avoid such duplicates, there is a wide range of commercial products and customized self-coded software. These programs can be quite expensive both in acquisition and maintenance. In particular, small and medium-sized companies cannot afford these tools. Moreover, it is difficult to set up and tune all necessary parameters in these programs. Recently, web-based applications for duplicate detection have emerged. However, they are not easy to integrate into the local IT landscape and require much manual configuration effort. With DAQS (Data Quality as a Service) we present a novel approach to support duplicate detection. The approach features (1) minimal required user interaction and (2) self-configuration for the provided input data. To this end, each data cleansing task is classified to find out which metadata is available. Next, similarity measures are automatically assigned to the provided records' attributes and a duplicate detection process is carried out. In this paper we introduce a novel matching approach, called one-to-some or 1:k assignment, to assign similarity measures to attributes. We performed an extensive evaluation on a large training corpus and ten test datasets of address data and achieved promising results.},
  booktitle = {Proceedings of the 19th International Conference on Cooperative Information Systems (CoopIS)},
  project   = {HPI},
  link1     = {PDF fileadmin/user_upload/fachgebiete/naumann/publications/2011/Vogel__Naumann__Instance-based_one-to-some_assignment_of_similarity_measures_to_attributes.pdf},
  sorting   = {26624}
}

@inproceedings{dqws,
  author    = {Vogel, Tobias},
  title     = {Self-Adaptive Data Quality {Web Services}},
  year      = {2010},
  address   = {Bad Helmstedt},
  booktitle = {Grundlagen von Datenbanken},
  project   = {HPI},
  link1     = {Self-Adaptive Data Quality Web Services fileadmin/user_upload/fachgebiete/naumann/publications/2010/self-adaptive_data_quality_web_services.pdf},
  sorting   = {28928}
}

@inproceedings{posr,
  author    = {AbuJarour, Mohammed and Craculeac, Mircea and Menge, Falko and Vogel, Tobias and Schwarz, Jan-Felix},
  title     = {{POSR}: A Comprehensive System for Aggregating and Using {Web Services} (demo)},
  year      = {2009},
  abstract  = {Recently, the number of public Web Services has been constantly increasing. Nevertheless, consuming Web Services as an end-user is not straightforward, because creating a suitable user interface for consuming a Web Service requires much effort. In this work, we introduce a novel approach where user interface fragments for consuming Web Services are generated automatically, and aggregated and customized by end-users to match their preferences. Users can collaboratively improve the auto-generated user interfaces and share them among each other. Our three main sources of Web Services are explicit registration, automatic identification and collecting over the Web, as well as extraction and generation from existing web applications. We validated our approach by implementing it as a comprehensive system coined ``Posr''.},
  booktitle = {Proceedings of the IEEE Services Cup 2009 at IEEE International Conference on Web Services (ICWS)},
  project   = {HPI},
  link1     = {POSR: A Comprehensive System for Aggregating and Using Web Services fileadmin/user_upload/fachgebiete/naumann/publications/2009/posr-paper.pdf},
  sorting   = {29184}
}

@inproceedings{conf_icsoc_VogelKN09,
  author    = {Vogel, Tobias and Kaufer, Frank and Naumann, Felix},
  title     = {Encapsulating Multi-stepped {Web} Forms as {Web Services}},
  year      = {2009},
  pages     = {488--497},
  abstract  = {HTML forms are the predominant interface between users and web applications. Many of these applications display a sequence of multiple forms on separate pages, for instance to book a flight or order a DVD. We introduce a method to wrap these multi-stepped forms and offer their individual functionality as a single consolidated Web Service. This Web Service in turn maps input data to the individual forms in the correct order. Such consolidation better enables operation of the forms by applications and provides a simpler interface for human users. To this end we analyze the HTML code and sample user interaction of each page and infer the internal model of the application. A particular challenge is to map semantically same fields across multiple forms and choose meaningful labels for them. Web Service output is parsed from the resulting HTML page. Experiments on different multi-stepped web forms show the feasibility and usefulness of our approach.},
  booktitle = {Proceedings of the 7th International Conference on Service-Oriented Computing (ICSOC)},
  project   = {HPI},
  link1     = {Encapsulating Multi-stepped Web Forms as Web Services fileadmin/user_upload/fachgebiete/naumann/publications/2009/Faster-Paper.pdf},
  sorting   = {54784}
}