Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Extract all URL links out of Google Slides presentation with Google Apps Script

I am trying to create a function that, when passed a Google Slides presentation ID, parses the presentation and writes all URL links it finds to a Google Sheet. I have adapted the following function, which does the same for a Google Docs document, from this answer by @Yuval:

/**
 * Finds every hyperlink in a Google Docs document and appends one
 * (document name, URL) row per link to the 'Extracted Links' sheet of the
 * active spreadsheet. Each document name and URL is also logged.
 *
 * Rows are buffered during the scan and written with a single setValues()
 * call afterwards, so nothing is written if the scan throws.
 *
 * @param {string} docId ID of the document to open with DocumentApp.openById.
 * @param {boolean} mergeAdjacent When truthy, consecutive styling segments
 *     that carry the same URL are reported as a single link.
 * @return {Object[]} One descriptor per link, with the keys section,
 *     isFirstPageSection, paragraph, textEl, startOffset,
 *     endOffsetInclusive (null when the link runs to the end of the text
 *     element) and url.
 * @throws {Error} If a section has no getParagraphs() method, or if links
 *     were found but the 'Extracted Links' sheet does not exist.
 */
function getAllLinks(docId, mergeAdjacent) {
  var links = [];
  // [document name, url] pairs waiting to be written to the sheet.
  var pendingRows = [];

  var doc = DocumentApp.openById(docId);
  var parentDocName = doc.getName();
  var ss = SpreadsheetApp.getActive();
  var sh = ss.getSheetByName('Extracted Links');

  iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
    if (!("getParagraphs" in section)) {
      // as we're using some undocumented API, adding this to avoid cryptic
      // messages upon possible API changes.
      throw new Error("An API change has caused this script to stop " +
                      "working.\n" +
                      "Section #" + sectionIndex + " of type " +
                      section.getType() + " has no .getParagraphs() method. " +
                      "Stopping script.");
    }

    section.getParagraphs().forEach(function(par) {
      // skip empty paragraphs
      if (par.getNumChildren() == 0) {
        return;
      }

      // go over all text elements in paragraph / list-item
      for (var el = par.getChild(0); el != null; el = el.getNextSibling()) {
        if (el.getType() != DocumentApp.ElementType.TEXT) {
          continue;
        }

        // go over all styling segments in text element
        var attributeIndices = el.getTextAttributeIndices();
        // Last link recorded for this text element; used for merging.
        var lastLink = null;
        attributeIndices.forEach(function(startOffset, i, attributeIndices) {
          var url = el.getLinkUrl(startOffset);
          if (url == null) {
            return;
          }

          // A segment ends right before the next one starts; the final
          // segment has no known end, which is recorded as null.
          var endOffsetInclusive = (i + 1 < attributeIndices.length ?
                                    attributeIndices[i + 1] - 1 : null);

          // check if this and the last found link are continuous
          if (mergeAdjacent && lastLink != null && lastLink.url == url &&
              lastLink.endOffsetInclusive == startOffset - 1) {
            // Extend the previous link instead of recording a new one.
            lastLink.endOffsetInclusive = endOffsetInclusive;
            return;
          }

          lastLink = {
            "section": section,
            "isFirstPageSection": isFirstPageSection,
            "paragraph": par,
            "textEl": el,
            "startOffset": startOffset,
            "endOffsetInclusive": endOffsetInclusive,
            "url": url
          };
          pendingRows.push([parentDocName, url]);
          Logger.log(parentDocName);
          Logger.log(url);
          links.push(lastLink);
        });
      }
    });
  });

  if (pendingRows.length > 0) {
    if (sh == null) {
      throw new Error("Sheet 'Extracted Links' not found in the active " +
                      "spreadsheet.");
    }
    // One batched write instead of two setValue() calls per link.
    sh.getRange(sh.getLastRow() + 1, 1, pendingRows.length, 2)
      .setValues(pendingRows);
  }

  return links;
}

/**
 * Calls the given function for each section of the document (body, header,
 * etc.). Sections are children of the DocumentElement object.
 *
 * @param {Document} doc The Document object (such as the one obtained via
 *     a call to DocumentApp.getActiveDocument()) with the sections to iterate
 *     over.
 * @param {Function} func A callback function which will be called, for each
 *     section, with the following arguments (in order):
 *       - {ContainerElement} section - the section element
 *       - {Number} sectionIndex - the child index of the section, such that
 *         doc.getBody().getParent().getChild(sectionIndex) == section.
 *       - {Boolean} isFirstPageSection - true when the section is a header
 *         or footer section other than the ones returned by doc.getHeader()
 *         and doc.getFooter().
 */
function iterateSections(doc, func) {
  // get the DocumentElement interface to iterate over all sections
  // this bit is undocumented API
  var docEl = doc.getBody().getParent();

  // Look each section up once; -1 means the document has no such section.
  var header = doc.getHeader();
  var footer = doc.getFooter();
  var regularHeaderSectionIndex = (header == null ? -1 :
                                   docEl.getChildIndex(header));
  var regularFooterSectionIndex = (footer == null ? -1 :
                                   docEl.getChildIndex(footer));

  for (var i = 0; i < docEl.getNumChildren(); ++i) {
    var section = docEl.getChild(i);
    var sectionType = section.getType();

    // Loose equality is kept on purpose: these values come from the
    // Apps Script host and are compared exactly as the original did.
    var isFirstPageSection = (
      i != regularHeaderSectionIndex &&
      i != regularFooterSectionIndex &&
      (sectionType == DocumentApp.ElementType.HEADER_SECTION ||
       sectionType == DocumentApp.ElementType.FOOTER_SECTION));

    func(section, i, isFirstPageSection);
  }
}

When I try to create the same for a Google Slides presentation as the input, I get stuck on how to parse the presentation and extract all of the text (in order to check it for links). It seems like I would need to use getSlides(), then getPageElements(), and iterate through those, but I am unclear on how to get to the actual text on the slides. Any tips on how to iterate through the actual text on slides (and potentially how to extract the link URL from that text if it has one) would be much appreciated. Thank you!

like image 912
Gabriel Tero Avatar asked Sep 20 '25 11:09

Gabriel Tero


1 Answers

If you just want to get the links from the slides, see the code below:

Code:

/**
 * Collects the URL of every hyperlinked text range found in the shapes of a
 * Google Slides presentation, logging each URL as it is found.
 *
 * Only text inside shapes is inspected. Plain-text URLs that are not
 * hyperlinked are not detected.
 *
 * @param {string} [presentationId] ID of the presentation to open. When it
 *     is omitted (or is not a non-empty string) the active presentation is
 *     used, which matches the original zero-argument behavior.
 * @returns {string[]} The URLs found, in slide, shape and link order.
 */
function getLinksFromSlides(presentationId) {
  const hasId = typeof presentationId === 'string' && presentationId !== '';
  const presentation = hasId
    ? SlidesApp.openById(presentationId)
    : SlidesApp.getActivePresentation();

  const urls = [];
  // traverse each slide
  presentation.getSlides().forEach((slide) => {
    // traverse each shape
    slide.getShapes().forEach((shape) => {
      // each returned range is a run of text that carries a link
      shape.getText().getLinks().forEach((linkRange) => {
        const link = linkRange.getTextStyle().getLink();
        const url = link == null ? null : link.getUrl();
        // Per the Slides service Link reference, getUrl() returns null for
        // links that are not external URLs (e.g. links to another slide).
        if (url == null) {
          return;
        }
        Logger.log(url);
        urls.push(url);
      });
    });
  });
  return urls;
}

Sample:

sample

Output:

output

Note:

  • This only extracts hyperlinks. It doesn't extract any URLs that aren't hyperlinked, as shown in the sample data (e.g. https://www.facebook.com).
  • If you want the non-hyperlink urls, then you might have to try regex.
like image 151
NightEye Avatar answered Sep 23 '25 00:09

NightEye